We will finally analyse this data properly.

After discussion with Amanda, we’ll be using the strictestRace/Geography assignments, see 06_revisiting_population_terms for more context on what this means.

# First we clean up the plotting labels:
allSRAFinal <- allSRAFinal %>% 
    mutate(strictestRace = gsub("or ", "or\n", strictestRace)) %>%
    mutate(strictestGeography = gsub("and ", "and\n", strictestGeography)) %>%
    mutate(strictestGeography = gsub("^Asia$", "Asia (NOS)", strictestGeography)) %>%
    mutate(strictestGeography = gsub("Subsaharan", "Sub-Saharan", strictestGeography)) %>%
    mutate(hispanic = gsub("^hispanic$", "Hispanic", hispanic)) %>%
    mutate(finalOrgan = gsub("^cancer$", "Cancer sample\n(NOS)", finalOrgan)) %>%
    mutate(finalSystem = gsub("^cancer$", "Cancer sample\n(NOS)", finalSystem)) %>%
    as.data.frame()

# For moving columns to upper case, because I forgot... 
firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

allSRAFinal$finalDisease <- firstup(allSRAFinal$finalDisease)
allSRAFinal$finalOrgan <- firstup(allSRAFinal$finalOrgan)
allSRAFinal$finalSystem <- firstup(allSRAFinal$finalSystem)

# Hispanic label trumps any other label
allSRAFinal$backupRace <- allSRAFinal$strictestRace
allSRAFinal$backupHispanic <- allSRAFinal$hispanic
allSRAFinal <- allSRAFinal %>% mutate(hispanic = na_if(hispanic, "non.hispanic"))
allSRAFinal$strictestRace <- coalesce(allSRAFinal$hispanic, allSRAFinal$strictestRace)
allSRAFinal$strictestRace <- factor(allSRAFinal$strictestRace)

# Now let's sort out some colour schemes... For Race we simply use Dark2 but we need to keep the mapping consistent, but for Geography we are following Alicia Martin and using a somewhat custom palette that doesn't follow anything standard, and then adding some new colours of our own; I opened a paper in Affinity and grabbed the html values and then added a couple for the new entries (Southeast Asia, Asia NOS):

allSRAFinal$strictestGeography <- factor(allSRAFinal$strictestGeography, levels = c("Sub-Saharan Africa", "North Africa and\nWestern Asia", "Europe", "South Asia", "Southeast Asia", "East Asia", "Asia (NOS)", "Oceania", "Americas", "Multiple", "Other"))

scale_fill_geography <- function(...){
    ggplot2:::manual_scale('fill',  
        values = setNames(c('#9C8DC3', '#F3D78A', '#DB6968', '#60BC55', '#BCCC45', '#4D97CD', '#04C3C8', '#C69C3A', '#F8984E', '#8B96AD', '#FBC9C4'), levels(allSRAFinal$strictestGeography)), 
    )
}

scale_color_geography <- function(...){
    ggplot2:::manual_scale('color',  
        values = setNames(c('#9C8DC3', '#F3D78A', '#DB6968', '#60BC55', '#BCCC45', '#4D97CD', '#04C3C8', '#C69C3A', '#F8984E', '#8B96AD', '#FBC9C4'), levels(allSRAFinal$strictestGeography)), 
    )
}

raceColours <- brewer.pal(8,"Set2") #8 because we don't plot NAs

scale_color_race <- function(...){
    ggplot2:::manual_scale('color', 
                           values = setNames(raceColours, levels(allSRAFinal$strictestRace)), 
    )
}

scale_fill_race <- function(...){
    ggplot2:::manual_scale('fill', 
        values = setNames(raceColours, levels(allSRAFinal$strictestRace)), 
    )
}

wberCols <- c(viridis::turbo(n = 7))
scale_fill_wber <- function(...){
    ggplot2:::manual_scale('fill', 
        values = setNames(wberCols, c("East Asia &\nPacific", "Europe &\nCentral Asia", "Latin America &\nCaribbean", "Middle East &\nNorth Africa", "North America", "South Asia", "Sub-Saharan Africa")), na.value = "grey50")
}


# We also define a couple of plot layouts:
long2Design <- "
  12
  12
"
wide2Design <- "
  11
  22
"

theme_set(theme_bw(base_size = 6))
theme_update(axis.text=element_text(size=7))
theme_update(legend.key.size = unit(12, 'pt'), #change legend key size
        legend.title = element_text(size=8), #change legend title font size
        legend.text = element_text(size=6),
        plot.title = element_text(size=8))
theme_update(plot.margin = unit(c(0, 0, 0, 0), "pt"))

We’re also going to use some additional info from the World Bank, so let’s add those columns in now:

worldBank <- read.csv("World_Bank_Descriptors.csv")

# Some names need to be fixed, but others are more flexible? I'd rather have USA than United States.
allSRAFinal$finalCountry <- gsub("^UK$", "United Kingdom", allSRAFinal$finalCountry) 
allSRAFinal$finalCountry <- gsub("^Korea$", "South Korea", allSRAFinal$finalCountry)

allSRAFinal$worldBank <- worldBank[match(allSRAFinal$finalCountry, worldBank$Economy),]$Income.group
# allSRAFinal %>% count(finalCountry, worldBank)

# Missing countries are all high income, so we can set them to that manually now that we're happy with the names:
allSRAFinal <- allSRAFinal %>% 
  mutate(worldBank = if_else(is.na(worldBank), "High income", worldBank)) %>%
  mutate(worldBank = if_else(is.na(finalCountry), NA, worldBank ))

# And now the regions they define:
allSRAFinal$worldRegion <- worldBank[match(allSRAFinal$finalCountry, worldBank$Economy),]$Region
# allSRAFinal %>% count(finalCountry, worldRegion)

# Once again we gotta add some manually...
allSRAFinal <- allSRAFinal %>% 
  mutate(worldRegion = if_else(grepl("Russia|Slovakia", finalCountry), "Europe & Central Asia", worldRegion)) %>%
  mutate(worldRegion = if_else(grepl("South Korea|Taiwan", finalCountry), "East Asia & Pacific", worldRegion)) %>%
  mutate(worldRegion = if_else(grepl("USA", finalCountry), "North America", worldRegion)) %>%
  as.data.frame()

# And now we need to clean up some labels so we can plot them better. Good thing is, it's just an ampersand:
allSRAFinal$worldRegion <- gsub("& ", "&\n", allSRAFinal$worldRegion)

1. Who is being sequenced?

geographyProp <- allSRAFinal %>% drop_na(strictestGeography) %>% count(strictestGeography) %>% mutate(freq = n/sum(n))
raceProp <- allSRAFinal %>% drop_na(strictestRace) %>% count(strictestRace) %>% mutate(freq = n/sum(n))

geoPlot <- ggplot(geographyProp, aes(x = fct_inorder(strictestGeography), y = n, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  ggtitle("Submitted descriptor\n(ancestral/geographic origin)") +
  xlab("") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_geography() +
  theme(axis.text.x = element_text(angle = 45, hjust=1)) +
  theme(legend.position="none")

geographyProp
##                strictestGeography    n        freq
## 1              Sub-Saharan Africa 1694 0.172417303
## 2  North Africa and\nWestern Asia   18 0.001832061
## 3                          Europe 5469 0.556641221
## 4                      South Asia  718 0.073078880
## 5                  Southeast Asia   49 0.004987277
## 6                       East Asia 1121 0.114096692
## 7                      Asia (NOS)  310 0.031552163
## 8                         Oceania   11 0.001119593
## 9                        Americas  166 0.016895674
## 10                       Multiple  207 0.021068702
## 11                          Other   62 0.006310433
sum(geographyProp$n)
## [1] 9825
racePlot <- ggplot(raceProp, aes(x = fct_inorder(strictestRace), y = n, fill=strictestRace)) +
  geom_bar(stat="identity") +
  ggtitle("Submitted descriptor\n(US Census term)") +
  xlab("") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_race() +
  theme(axis.text.x = element_text(angle = 45, hjust=1)) +
  theme(legend.position="none")

raceProp
##                                 strictestRace    n         freq
## 1          American Indian and\nAlaska Native   43 0.0030594095
## 2                                       Asian  944 0.0671647101
## 3                  Black or\nAfrican American 1742 0.1239416578
## 4                                    Hispanic 1255 0.0892920669
## 5                                    Multiple  203 0.0144432586
## 6 Native Hawaiian and\nother Pacific Islander    6 0.0004268943
## 7                                       Other  162 0.0115261473
## 8                                       White 9700 0.6901458556
sum(raceProp$n)
## [1] 14055
geoPlot + racePlot +
    plot_layout(design = long2Design)

ggsave("fig1_overall.pdf")
## Saving 7 x 5 in image
# And now with log transform on the axis:
geoPlot <- geoPlot +
  scale_y_continuous(trans='log10')
racePlot <- racePlot +
  scale_y_continuous(trans='log10')

geoPlot + racePlot +
    plot_layout(design = long2Design)

ggsave("fig1_overall_log10.pdf")
## Saving 7 x 5 in image
# There's 132 samples that do not have any info besides "Not Hispanic", which I think we should treat as NA, and therefore drop. The other option is to change them to race: Other and then move on with our lives. 
allSRAFinal %>% filter(is.na(allSRAFinal$strictestGeography) & is.na(allSRAFinal$strictestRace)) %>% count(ETHNICITY, RACE)
##                           ETHNICITY                              RACE  n
## 1 American indian or alaskan native                              <NA> 14
## 2                      Non-Hispanic                              <NA> 36
## 3            Not Hispanic or Latino           Unknown or Not Reported  4
## 4            Not Hispanic or Latino                              <NA> 92
## 5                              <NA> American Indian or Alaskan Native  2
allSRAFinal <- allSRAFinal %>% filter(!is.na(allSRAFinal$strictestGeography) | !is.na(allSRAFinal$strictestRace))

nrow(allSRAFinal[!is.na(allSRAFinal$strictestGeography) | !is.na(allSRAFinal$strictestRace),])
## [1] 23880
dim(allSRAFinal)
## [1] 23880    49

All samples accounted for now, at last!

So let’s get a bit more granular, showing n samples per study:

## Using SRA.Study, strictestGeography as id variables
## Using SRA.Study, strictestRace as id variables

Let’s get some meaningful statistics on this:

First we separate studies on the basis of whether they contain racial or geographic descriptors, but the hispanic usage introduces a new complexity… There’s a bunch of studies that now straddle the two categories. Could shift them to race like we did before, but better to just leave them alone for now? I am overthinking.

geographyClean <- allSRAFinal %>% drop_na(strictestGeography) 
raceClean <- allSRAFinal %>% drop_na(strictestRace)

geoSRA <- unique(geographyClean$SRA.Study)
length(geoSRA)
## [1] 135
raceSRA <- unique(raceClean$SRA.Study)
length(raceSRA)
## [1] 139
length(intersect(geoSRA, raceSRA)) 
## [1] 13
allSRAFinal %>% filter(grepl(paste(intersect(geoSRA, raceSRA), collapse="|"), SRA.Study)) %>%
  group_by(SRA.Study) %>%
  summarise(size = n(), studyJoint = n_distinct(SRA.Study)) 
## # A tibble: 13 Ă— 3
##    SRA.Study  size studyJoint
##    <chr>     <int>      <int>
##  1 DRP001797    42          1
##  2 SRP070663    12          1
##  3 SRP086245    42          1
##  4 SRP108559    32          1
##  5 SRP172694     6          1
##  6 SRP216947    36          1
##  7 SRP221484   195          1
##  8 SRP245400   345          1
##  9 SRP274641     6          1
## 10 SRP283115    17          1
## 11 SRP303641   100          1
## 12 SRP374111   100          1
## 13 SRP388678   492          1
allSRAFinal %>% filter(grepl(paste(intersect(geoSRA, raceSRA), collapse="|"), SRA.Study)) %>% nrow()
## [1] 1425
# Diversity by study:
geographyClean %>% group_by(SRA.Study) %>% 
  summarise(size = n(), studyGeo = n_distinct(strictestGeography)) %>%
  summarise(n = size, meanSize = mean(size), meanGeo = mean(studyGeo), maxSize = max(size), maxGeo = max(studyGeo), sdSize = sd(size), sdGeo = sd(studyGeo))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 135 Ă— 7
##        n meanSize meanGeo maxSize maxGeo sdSize sdGeo
##    <int>    <dbl>   <dbl>   <int>  <int>  <dbl> <dbl>
##  1    40     72.8    1.67     753      6   128.  1.15
##  2   464     72.8    1.67     753      6   128.  1.15
##  3   332     72.8    1.67     753      6   128.  1.15
##  4    51     72.8    1.67     753      6   128.  1.15
##  5   133     72.8    1.67     753      6   128.  1.15
##  6    66     72.8    1.67     753      6   128.  1.15
##  7    20     72.8    1.67     753      6   128.  1.15
##  8   159     72.8    1.67     753      6   128.  1.15
##  9    44     72.8    1.67     753      6   128.  1.15
## 10   181     72.8    1.67     753      6   128.  1.15
## # ℹ 125 more rows
raceClean %>% group_by(SRA.Study) %>% 
  summarise(size = n(), studyRace = n_distinct(strictestRace)) %>%
  summarise(n = size, meanSize = mean(size), meanRace = mean(studyRace), maxSize = max(size), maxRace = max(studyRace), sdSize = sd(size), sdRace = sd(studyRace))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 139 Ă— 7
##        n meanSize meanRace maxSize maxRace sdSize sdRace
##    <int>    <dbl>    <dbl>   <int>   <int>  <dbl>  <dbl>
##  1     2     101.     2.26    1857       8   213.   1.44
##  2    36     101.     2.26    1857       8   213.   1.44
##  3    13     101.     2.26    1857       8   213.   1.44
##  4    20     101.     2.26    1857       8   213.   1.44
##  5    95     101.     2.26    1857       8   213.   1.44
##  6    37     101.     2.26    1857       8   213.   1.44
##  7     9     101.     2.26    1857       8   213.   1.44
##  8    10     101.     2.26    1857       8   213.   1.44
##  9    12     101.     2.26    1857       8   213.   1.44
## 10   113     101.     2.26    1857       8   213.   1.44
## # ℹ 129 more rows
# By descriptor across all studies
geographyClean %>% group_by(SRA.Study) %>% count(strictestGeography) %>% group_by(strictestGeography) %>% 
  summarise(size = sum(n), studyGeo = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) 
## # A tibble: 11 Ă— 6
##    strictestGeography                size studyGeo  mean   max      sd
##    <fct>                            <int>    <int> <dbl> <int>   <dbl>
##  1 "Sub-Saharan Africa"              1694       42 40.3    355  70.7  
##  2 "North Africa and\nWestern Asia"    18        7  2.57     9   2.94 
##  3 "Europe"                          5469       87 62.9    753 109.   
##  4 "South Asia"                       718       13 55.2    365 102.   
##  5 "Southeast Asia"                    49        2 24.5     48  33.2  
##  6 "East Asia"                       1121       25 44.8    208  57.5  
##  7 "Asia (NOS)"                       310       25 12.4     66  16.6  
##  8 "Oceania"                           11        5  2.2      3   0.447
##  9 "Americas"                         166        9 18.4     39  14.3  
## 10 "Multiple"                         207        5 41.4    153  63.5  
## 11 "Other"                             62        6 10.3     41  15.4
raceClean %>% group_by(SRA.Study) %>% count(strictestRace) %>% group_by(strictestRace) %>% 
  summarise(size = sum(n), studyRace = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) 
## # A tibble: 8 Ă— 6
##   strictestRace                                  size studyRace  mean   max     sd
##   <fct>                                         <int>     <int> <dbl> <int>  <dbl>
## 1 "American Indian and\nAlaska Native"             43        10   4.3    18   5.38
## 2 "Asian"                                         944        47  20.1   164  29.9 
## 3 "Black or\nAfrican American"                   1742        74  23.5   177  30.9 
## 4 "Hispanic"                                     1255        50  25.1   562  79.8 
## 5 "Multiple"                                      203        13  15.6    49  16.5 
## 6 "Native Hawaiian and\nother Pacific Islander"     6         3   2       3   1   
## 7 "Other"                                         162        11  14.7    64  22.3 
## 8 "White"                                        9700       106  91.5  1005 162.

2. And by who? Where is the sequencing happening?

There are many ways to slice this, but some are harder to see than others, so here is the final best attempt…

## Using finalCountry, strictestGeography, worldRegion as id variables
## Warning: The `legend.title.align` argument of `theme()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use theme(legend.title = element_text(hjust)) instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Using finalCountry, strictestRace, worldRegion as id variables

## Saving 7 x 5 in image
## Using worldRegion, strictestGeography as id variables
## Using worldRegion, strictestRace as id variables

Some nice statistics about all of this:

# By study by world region:
geographyClean %>% group_by(worldRegion, SRA.Study) %>% 
  summarise(size = n(), studyGeo = n_distinct(strictestGeography)) %>%
  summarise(n = n(), meanSize = mean(size), meanGeo = mean(studyGeo), maxSize = max(size), maxGeo = max(studyGeo), sdSize = sd(size), sdGeo = sd(studyGeo))
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## # A tibble: 6 Ă— 8
##   worldRegion                      n meanSize meanGeo maxSize maxGeo sdSize sdGeo
##   <chr>                        <int>    <dbl>   <dbl>   <int>  <int>  <dbl> <dbl>
## 1 "East Asia &\nPacific"          32     44.6    1.31     238      5  59.2  0.931
## 2 "Europe &\nCentral Asia"        40     85.9    1.4      748      4 141.   0.810
## 3 "Latin America &\nCaribbean"     4     21      1         27      1   5.48 0    
## 4 "North America"                 54     89.1    2.17     753      6 152.   1.37 
## 5 "South Asia"                     2     13      2         14      3   1.41 1.41 
## 6  <NA>                            3     13.3    1         18      1   5.03 0
raceClean %>% group_by(worldRegion, SRA.Study) %>% 
  summarise(size = n(), studyRace = n_distinct(strictestRace)) %>%
  summarise(n = n(), meanSize = mean(size), meanRace = mean(studyRace), maxSize = max(size), maxRace = max(studyRace), sdSize = sd(size), sdRace = sd(studyRace))
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
## # A tibble: 4 Ă— 8
##   worldRegion                       n meanSize meanRace maxSize maxRace sdSize sdRace
##   <chr>                         <int>    <dbl>    <dbl>   <int>   <int>  <dbl>  <dbl>
## 1 "East Asia &\nPacific"           14     27.6     1.36     166       4   40.9  0.842
## 2 "Europe &\nCentral Asia"         14    180       2.43     772       5  319.   1.45 
## 3 "Middle East &\nNorth Africa"     1     11       1         11       1   NA   NA    
## 4 "North America"                 110    101.      2.36    1857       8  209.   1.47
# By descriptor by economic region
geographyClean %>% group_by(SRA.Study, worldRegion) %>% count(strictestGeography) %>% group_by(worldRegion, strictestGeography) %>% 
  summarise(size = sum(n), studyGeo = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) %>%
  as.data.frame()
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
##                   worldRegion             strictestGeography size studyGeo       mean max          sd
## 1        East Asia &\nPacific             Sub-Saharan Africa    9        2   4.500000   8   4.9497475
## 2        East Asia &\nPacific North Africa and\nWestern Asia    3        1   3.000000   3          NA
## 3        East Asia &\nPacific                         Europe  191        8  23.875000  61  21.1351941
## 4        East Asia &\nPacific                     South Asia   60        3  20.000000  44  21.6333077
## 5        East Asia &\nPacific                 Southeast Asia   49        2  24.500000  48  33.2340187
## 6        East Asia &\nPacific                      East Asia 1016       22  46.181818 208  60.6360229
## 7        East Asia &\nPacific                     Asia (NOS)  100        4  25.000000  39  13.4907376
## 8      Europe &\nCentral Asia             Sub-Saharan Africa  516       10  51.600000 129  42.8205558
## 9      Europe &\nCentral Asia                         Europe 2051       32  64.093750 375  91.6058737
## 10     Europe &\nCentral Asia                     South Asia  611        5 122.200000 365 148.2588952
## 11     Europe &\nCentral Asia                      East Asia   40        1  40.000000  40          NA
## 12     Europe &\nCentral Asia                     Asia (NOS)   23        3   7.666667  18   9.0737717
## 13     Europe &\nCentral Asia                       Americas   40        2  20.000000  39  26.8700577
## 14     Europe &\nCentral Asia                       Multiple  153        1 153.000000 153          NA
## 15     Europe &\nCentral Asia                          Other    3        2   1.500000   2   0.7071068
## 16 Latin America &\nCaribbean                       Americas   84        4  21.000000  27   5.4772256
## 17              North America             Sub-Saharan Africa 1169       30  38.966667 355  79.7541409
## 18              North America North Africa and\nWestern Asia   15        6   2.500000   9   3.2093613
## 19              North America                         Europe 3186       43  74.093023 753 131.8102558
## 20              North America                     South Asia   23        3   7.666667  14   5.5075705
## 21              North America                      East Asia   65        2  32.500000  63  43.1335137
## 22              North America                     Asia (NOS)  186       17  10.941176  66  17.7463749
## 23              North America                        Oceania   11        5   2.200000   3   0.4472136
## 24              North America                       Americas   42        3  14.000000  36  19.0787840
## 25              North America                       Multiple   54        4  13.500000  26  13.8684294
## 26              North America                          Other   59        4  14.750000  41  17.8021534
## 27                 South Asia                         Europe    1        1   1.000000   1          NA
## 28                 South Asia                     South Asia   24        2  12.000000  12   0.0000000
## 29                 South Asia                     Asia (NOS)    1        1   1.000000   1          NA
## 30                       <NA>                         Europe   40        3  13.333333  18   5.0332230
raceClean %>% group_by(SRA.Study, worldRegion) %>% count(strictestRace) %>% group_by(worldRegion, strictestRace) %>% 
  summarise(size = sum(n), studyRace = n_distinct(SRA.Study), mean = mean(n), max = max(n), sd = sd(n)) %>%
  as.data.frame()
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
##                    worldRegion                               strictestRace size studyRace      mean  max         sd
## 1         East Asia &\nPacific          American Indian and\nAlaska Native    3         1   3.00000    3         NA
## 2         East Asia &\nPacific                                       Asian  338        11  30.72727  164  44.884498
## 3         East Asia &\nPacific                  Black or\nAfrican American    2         1   2.00000    2         NA
## 4         East Asia &\nPacific                                    Hispanic   14         2   7.00000   12   7.071068
## 5         East Asia &\nPacific                                       White   29         4   7.25000   13   5.123475
## 6       Europe &\nCentral Asia          American Indian and\nAlaska Native    3         1   3.00000    3         NA
## 7       Europe &\nCentral Asia                                       Asian  194         4  48.50000   96  54.848276
## 8       Europe &\nCentral Asia                  Black or\nAfrican American  175         9  19.44444   80  25.884895
## 9       Europe &\nCentral Asia                                    Hispanic  167         5  33.40000   96  37.825917
## 10      Europe &\nCentral Asia                                       Other  142         3  47.33333   64  17.009801
## 11      Europe &\nCentral Asia                                       White 1839        12 153.25000  630 244.541547
## 12 Middle East &\nNorth Africa                                       White   11         1  11.00000   11         NA
## 13               North America          American Indian and\nAlaska Native   37         8   4.62500   18   6.045955
## 14               North America                                       Asian  412        32  12.87500   55  13.689530
## 15               North America                  Black or\nAfrican American 1565        64  24.45312  177  31.828873
## 16               North America                                    Hispanic 1074        43  24.97674  562  85.245161
## 17               North America                                    Multiple  203        13  15.61538   49  16.545586
## 18               North America Native Hawaiian and\nother Pacific Islander    6         3   2.00000    3   1.000000
## 19               North America                                       Other   20         8   2.50000    5   1.309307
## 20               North America                                       White 7821        89  87.87640 1005 151.443495
allSRAFinal %>% filter(!is.na(worldBank)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##             worldBank     n        freq
## 1         High income 23266 0.975922819
## 2 Upper middle income   548 0.022986577
## 3 Lower middle income    26 0.001090604
allSRAFinal %>% filter(!is.na(finalCountry)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##      finalCountry     n         freq
## 1             USA 14804 0.6209731544
## 2         Belgium  2413 0.1012164430
## 3  United Kingdom  2018 0.0846476510
## 4          Canada  1144 0.0479865772
## 5          Sweden   540 0.0226510067
## 6           China   464 0.0194630872
## 7       Singapore   452 0.0189597315
## 8     South Korea   411 0.0172399329
## 9     Switzerland   278 0.0116610738
## 10         Taiwan   222 0.0093120805
## 11         Russia   166 0.0069630872
## 12          Japan   139 0.0058305369
## 13      Australia   126 0.0052852349
## 14        Denmark   124 0.0052013423
## 15          Italy    99 0.0041526846
## 16          Spain    91 0.0038171141
## 17        Germany    78 0.0032718121
## 18         Brazil    57 0.0023909396
## 19        Hungary    54 0.0022651007
## 20         Poland    34 0.0014261745
## 21         Greece    28 0.0011744966
## 22         Mexico    27 0.0011325503
## 23          India    26 0.0010906040
## 24       Slovakia    23 0.0009647651
## 25         Israel    11 0.0004614094
## 26        Austria     8 0.0003355705
## 27         France     3 0.0001258389
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##             worldBank    n        freq
## 1         High income 9332 0.949821883
## 2 Upper middle income  427 0.043460560
## 3                <NA>   40 0.004071247
## 4 Lower middle income   26 0.002646310
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(worldBank) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##             worldBank     n        freq
## 1         High income 13934 0.991390964
## 2 Upper middle income   121 0.008609036
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##      finalCountry    n         freq
## 1             USA 3738 0.3804580153
## 2  United Kingdom 1951 0.1985750636
## 3          Canada 1072 0.1091094148
## 4          Sweden  540 0.0549618321
## 5     South Korea  411 0.0418320611
## 6           China  343 0.0349109415
## 7       Singapore  286 0.0291094148
## 8     Switzerland  278 0.0282951654
## 9          Taiwan  222 0.0225954198
## 10      Australia  126 0.0128244275
## 11        Denmark  124 0.0126208651
## 12         Russia  118 0.0120101781
## 13          Italy   87 0.0088549618
## 14          Spain   79 0.0080407125
## 15        Germany   72 0.0073282443
## 16         Brazil   57 0.0058015267
## 17        Belgium   56 0.0056997455
## 18        Hungary   54 0.0054961832
## 19          Japan   40 0.0040712468
## 20           <NA>   40 0.0040712468
## 21         Greece   28 0.0028498728
## 22         Mexico   27 0.0027480916
## 23          India   26 0.0026463104
## 24         Poland   24 0.0024427481
## 25       Slovakia   23 0.0023409669
## 26         France    3 0.0003053435
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(finalCountry) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##      finalCountry     n         freq
## 1             USA 11066 0.7873354678
## 2         Belgium  2357 0.1676983280
## 3       Singapore   166 0.0118107435
## 4           China   121 0.0086090359
## 5           Japan    99 0.0070437567
## 6          Canada    72 0.0051227321
## 7  United Kingdom    67 0.0047669868
## 8          Russia    48 0.0034151547
## 9           Italy    12 0.0008537887
## 10          Spain    12 0.0008537887
## 11         Israel    11 0.0007826396
## 12         Poland    10 0.0007114906
## 13        Austria     8 0.0005691925
## 14        Germany     6 0.0004268943
allSRAFinal %>% filter(!is.na(strictestGeography)) %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##                  worldRegion    n        freq
## 1              North America 4810 0.489567430
## 2     Europe &\nCentral Asia 3437 0.349821883
## 3       East Asia &\nPacific 1428 0.145343511
## 4 Latin America &\nCaribbean   84 0.008549618
## 5                       <NA>   40 0.004071247
## 6                 South Asia   26 0.002646310
allSRAFinal %>% filter(!is.na(strictestRace)) %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##                   worldRegion     n         freq
## 1               North America 11138 0.7924581999
## 2      Europe &\nCentral Asia  2520 0.1792956243
## 3        East Asia &\nPacific   386 0.0274635361
## 4 Middle East &\nNorth Africa    11 0.0007826396
allSRAFinal %>% count(worldRegion) %>% mutate(freq = n/sum(n)) %>% arrange(desc(freq))
##                   worldRegion     n         freq
## 1               North America 15948 0.6678391960
## 2      Europe &\nCentral Asia  5957 0.2494556114
## 3        East Asia &\nPacific  1814 0.0759631491
## 4  Latin America &\nCaribbean    84 0.0035175879
## 5                        <NA>    40 0.0016750419
## 6                  South Asia    26 0.0010887772
## 7 Middle East &\nNorth Africa    11 0.0004606365
# And finally... are studies from certain regions bigger than others?
NAEurSizes <- allSRAFinal %>% filter(worldRegion == "North America" | worldRegion == "Europe") %>%
  count(SRA.Study)

restWorldSizes <- allSRAFinal %>% filter(worldRegion != "North America" & worldRegion != "Europe") %>%
  count(SRA.Study)

t.test(restWorldSizes$n, NAEurSizes$n)
## 
##  Welch Two Sample t-test
## 
## data:  restWorldSizes$n and NAEurSizes$n
## t = -1.3275, df = 251.9, p-value = 0.1855
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -72.20593  14.05916
## sample estimates:
## mean of x mean of y 
##   75.1619  104.2353
allStudySizes <- allSRAFinal %>% group_by(SRA.Study) %>% 
  summarise(sampleSize = n(), region = worldRegion) %>% 
  distinct()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'SRA.Study'. You can override using the `.groups` argument.
lm(sampleSize ~ region, data = allStudySizes)
## 
## Call:
## lm(formula = sampleSize ~ region, data = allStudySizes)
## 
## Coefficients:
##                       (Intercept)       regionEurope &\nCentral Asia   regionLatin America &\nCaribbean  regionMiddle East &\nNorth Africa                regionNorth America  
##                             40.31                              72.09                             -19.31                             -29.31                              63.92  
##                  regionSouth Asia  
##                            -27.31
anova(lm(sampleSize ~ region, data = allStudySizes))
## Analysis of Variance Table
## 
## Response: sampleSize
##            Df  Sum Sq Mean Sq F value Pr(>F)
## region      5  204346   40869  1.2516 0.2855
## Residuals 252 8228480   32653
# Note that there's an issue here - although we filtered for studies with more than 10 entries, not all of those will have 10 samples with descriptors, which explains why these numbers are smaller. So long as we are consistent across the two datasets, it should be ok. 

Maybe worth considering some alluvial plots? At the single country level they’re very messy, however, so what about at the World Bank region level?

allSRAFinal %>% count(strictestGeography, worldRegion) %>% drop_na(c(strictestGeography, worldRegion)) %>%
  ggplot(data = .,
         aes(axis1 = worldRegion, axis2 = strictestGeography, y = n)) +
    scale_x_discrete(limits = c("SRA Depositor\nRegion", "Population\nDescriptor"), expand = c(.2, .05)) +
    geom_alluvium(aes(fill = strictestGeography)) +
    scale_fill_geography(name="Population\nDescriptor") +
    geom_stratum(width=1/3) +
    ylab("Samples") +
    geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
    theme_minimal(base_size = 6) +
    theme(legend.position = "none")
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.

allSRAFinal %>% count(strictestRace, worldRegion) %>% drop_na(c(strictestRace, worldRegion)) %>%
  ggplot(data = .,
         aes(axis1 = worldRegion, axis2 = strictestRace, y = n)) +
    scale_x_discrete(limits = c("SRA Depositor\nRegion", "US Census Racial Term"), expand = c(.2, .05)) +
    geom_alluvium(aes(fill = strictestRace)) +
    scale_fill_race(name="US Census\nTerm)") +
    geom_stratum(width=1/3) +
    ylab("Samples") +
    # geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
    geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
    theme_minimal(base_size = 6) +
    theme(legend.position = "none")

### And finally... does it have any kind of description??
allSRAFinal <- allSRAFinal %>% 
  mutate(hasDescriptor = if_else(is.na(strictestRace), "Geographic", "Racial"))

# Easiest to see with a boxplot?
descriptorUsePlot <- ggplot(allSRAFinal, aes(x = finalCountry, fill = hasDescriptor)) +
  geom_bar(position = "fill") +
  ggtitle("") +
  xlab("SRA depositor country") +
  ylab("Proportion of samples") +
  coord_fixed(ratio=6) +
  guides(fill=guide_legend(title="Descriptor\ntype")) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

descriptorUsePlot

ggsave("fig2_descriptor_use.pdf")
## Saving 7 x 5 in image
descriptorUseRegionPlot <- ggplot(allSRAFinal, aes(x = worldRegion, fill = hasDescriptor)) +
  geom_bar(position = "fill") +
  ggtitle("") +
  xlab("SRA depositor region") +
  ylab("Proportion of samples") +
  guides(fill=guide_legend(title="Descriptor\ntype")) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

descriptorUseRegionPlot

ggsave("fig2_descriptor_use_region_plot.pdf")
## Saving 7 x 5 in image
descriptorUseRegionAlluvial <- allSRAFinal %>% count(hasDescriptor, worldRegion) %>% drop_na(c(hasDescriptor, worldRegion)) %>%
  ggplot(data = .,
         aes(axis1 = worldRegion, axis2 = hasDescriptor, y = n)) +
    scale_x_discrete(limits = c("SRA Depositor\nRegion", "Population\nDescriptor"), expand = c(.2, .05)) +
    geom_alluvium(aes(fill = hasDescriptor)) +
    geom_stratum(width=1/3) +
    ylab("Samples") +
    geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
    theme_minimal(base_size = 6) +
    theme(legend.position = "none")

descriptorUseRegionAlluvial

ggsave("fig2_descriptor_use_region_alluvial.pdf")
## Saving 7 x 5 in image

And now, some statistics from the figures above:

allSRAFinal %>% count(finalCountry, hasDescriptor) %>% 
  group_by(finalCountry) %>%
  mutate(percent = 100 * n/sum(n)) %>%
  ungroup %>%
  as.data.frame()
##      finalCountry hasDescriptor     n    percent
## 1       Australia    Geographic   126 100.000000
## 2         Austria        Racial     8 100.000000
## 3         Belgium    Geographic    56   2.320763
## 4         Belgium        Racial  2357  97.679237
## 5          Brazil    Geographic    57 100.000000
## 6          Canada    Geographic  1072  93.706294
## 7          Canada        Racial    72   6.293706
## 8           China    Geographic   343  73.922414
## 9           China        Racial   121  26.077586
## 10        Denmark    Geographic   124 100.000000
## 11         France    Geographic     3 100.000000
## 12        Germany    Geographic    72  92.307692
## 13        Germany        Racial     6   7.692308
## 14         Greece    Geographic    28 100.000000
## 15        Hungary    Geographic    54 100.000000
## 16          India    Geographic    26 100.000000
## 17         Israel        Racial    11 100.000000
## 18          Italy    Geographic    87  87.878788
## 19          Italy        Racial    12  12.121212
## 20          Japan    Geographic    40  28.776978
## 21          Japan        Racial    99  71.223022
## 22         Mexico    Geographic    27 100.000000
## 23         Poland    Geographic    24  70.588235
## 24         Poland        Racial    10  29.411765
## 25         Russia    Geographic   118  71.084337
## 26         Russia        Racial    48  28.915663
## 27      Singapore    Geographic   286  63.274336
## 28      Singapore        Racial   166  36.725664
## 29       Slovakia    Geographic    23 100.000000
## 30    South Korea    Geographic   411 100.000000
## 31          Spain    Geographic    79  86.813187
## 32          Spain        Racial    12  13.186813
## 33         Sweden    Geographic   540 100.000000
## 34    Switzerland    Geographic   278 100.000000
## 35         Taiwan    Geographic   222 100.000000
## 36            USA    Geographic  3738  25.249932
## 37            USA        Racial 11066  74.750068
## 38 United Kingdom    Geographic  1951  96.679881
## 39 United Kingdom        Racial    67   3.320119
## 40           <NA>    Geographic    40 100.000000
allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>% 
  count(hasDescriptor) %>% 
  mutate(freq = n/sum(n))
##   hasDescriptor    n      freq
## 1    Geographic 6087 0.6706699
## 2        Racial 2989 0.3293301
allSRAFinal %>% filter(grepl('USA', finalCountry)) %>% 
  count(hasDescriptor) %>% 
  mutate(freq = n/sum(n))
##   hasDescriptor     n      freq
## 1    Geographic  3738 0.2524993
## 2        Racial 11066 0.7475007
descriptorChi <- full_join((allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>% 
  count(hasDescriptor) %>% 
  mutate(freq = n/sum(n))),
  (allSRAFinal %>% filter(grepl('USA', finalCountry)) %>% 
  count(hasDescriptor) %>% 
  mutate(freq = n/sum(n))), by = "hasDescriptor", suffix = c("noUSA", "USA"))

chisq.test(descriptorChi[,c(2,4)])
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  descriptorChi[, c(2, 4)]
## X-squared = 4061.3, df = 1, p-value < 2.2e-16
allSRAFinal %>% count(hasDescriptor) %>% 
  mutate(freq = n/sum(n))
##   hasDescriptor     n      freq
## 1    Geographic  9825 0.4114322
## 2        Racial 14055 0.5885678
# We also want to know the mean sample size of studies in the USA vs those not in the USA, for funsies
usaSizes <- allSRAFinal %>% filter(grepl('USA', finalCountry)) %>%
  group_by(SRA.Study) %>%
  summarise(n = n())
mean(usaSizes$n)
## [1] 103.5245
nousaSizes <- allSRAFinal %>% filter(!grepl('USA', finalCountry)) %>%
  group_by(SRA.Study) %>%
  summarise(n = n())
mean(nousaSizes$n)
## [1] 76.91525
t.test(usaSizes$n, nousaSizes$n)
## 
##  Welch Two Sample t-test
## 
## data:  usaSizes$n and nousaSizes$n
## t = 1.2117, df = 258.91, p-value = 0.2267
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -16.63314  69.85159
## sample estimates:
## mean of x mean of y 
## 103.52448  76.91525

3. What tissues are they sequencing?

As above, we start by looking simply at the country/region where sequencing is happening. A quick straw poll suggested finalOrgan was more interpretable than finalSystem, so I’m sticking with organ. Will have to work out how to do plots of disease and tissue with all the missing values, but I think we’re getting closer

# First we focus on population descriptors:
geographySummary <- allSRAFinal %>% count(strictestGeography, finalCountry, worldRegion, finalOrgan, finalDisease)
raceSummary <- allSRAFinal %>% count(strictestRace, finalCountry, worldRegion, finalOrgan, finalDisease)

geoOrgan <- geographySummary %>% group_by(finalOrgan, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography)) %>% as.data.frame %>%
# geoOrgan <- meltGeography %>% drop_na(c(finalOrgan, strictestGeography)) %>%
  ggplot(., aes(x = fct_rev(finalOrgan), y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  xlab("") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", label.position="left", ncol=1)) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
raceOrgan <- raceSummary %>% group_by(finalOrgan, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace)) %>% as.data.frame %>%
# raceOrgan <- meltRace %>% drop_na(c(finalOrgan, strictestRace)) %>%
  ggplot(., aes(x = fct_rev(finalOrgan), y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="", label.position="left")) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
geoOrgan + raceOrgan +
  plot_layout(design=long2Design) +
  plot_annotation(tag_levels = 'A') 

ggsave("fig3_organ_by_descriptor.pdf", height=8, width=6)
ggsave("fig3_organ_by_descriptor.png", height=8, width=6)

# And now filtering to only those tissues with more than 50 observations

geoOrganSlim <- geographySummary %>% group_by(finalOrgan, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography)) %>% group_by(finalOrgan) %>% filter(sum(value) > 99) %>% as.data.frame %>%
# geoOrgan <- meltGeography %>% drop_na(c(finalOrgan, strictestGeography)) %>%
  ggplot(., aes(x = finalOrgan, y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  xlab(NULL) +
  ylab("Samples") +
  # coord_flip() +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", label.position="left", ncol=3)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
raceOrganSlim <- raceSummary %>% group_by(finalOrgan, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace)) %>% group_by(finalOrgan) %>% filter(sum(value) > 99) %>% as.data.frame %>%
# raceOrgan <- meltRace %>% drop_na(c(finalOrgan, strictestRace)) %>%
  ggplot(., aes(x = finalOrgan, y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  xlab(NULL) +
  ylab("Samples") +
  # coord_flip() +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="", label.position="left", ncol=2)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalOrgan'. You can override using the `.groups` argument.
geoOrganSlim + raceOrganSlim +
  plot_layout(design=wide2Design) +
  plot_annotation(tag_levels = 'A') 

ggsave("fig3_organ_by_descriptor_slim.pdf", width = 7, height = 4.5)
ggsave("fig3_organ_by_descriptor_slim.png", width = 7, height = 4.5)

# Some statistics: The percentage of samples that each tissue accounts for, and how many descriptors are associated:
allSRAFinal %>% count(hasDescriptor, finalOrgan) %>% group_by(hasDescriptor) %>% summarise(finalOrgan = finalOrgan, n = n, proportion = n/sum(n)) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'hasDescriptor'. You can override using the `.groups` argument.
##    hasDescriptor           finalOrgan    n   proportion
## 1     Geographic              Adipose   54 5.496183e-03
## 2     Geographic              Bladder   10 1.017812e-03
## 3     Geographic           Blastoderm   22 2.239186e-03
## 4     Geographic                Blood 5076 5.166412e-01
## 5     Geographic         Blood vessel  131 1.333333e-02
## 6     Geographic                 Bone    4 4.071247e-04
## 7     Geographic          Bone marrow  102 1.038168e-02
## 8     Geographic                Brain  612 6.229008e-02
## 9     Geographic               Breast   35 3.562341e-03
## 10    Geographic                  CNS   15 1.526718e-03
## 11    Geographic Cancer sample\n(NOS)   17 1.730280e-03
## 12    Geographic                Heart  195 1.984733e-02
## 13    Geographic                 IPSC  901 9.170483e-02
## 14    Geographic            Intestine  866 8.814249e-02
## 15    Geographic                Joint   92 9.363868e-03
## 16    Geographic               Kidney   24 2.442748e-03
## 17    Geographic                Liver   99 1.007634e-02
## 18    Geographic                 Lung  224 2.279898e-02
## 19    Geographic               Morula   41 4.173028e-03
## 20    Geographic               Muscle  806 8.203562e-02
## 21    Geographic                 Nose   43 4.376590e-03
## 22    Geographic      Pituitary gland    7 7.124682e-04
## 23    Geographic             Placenta   32 3.256997e-03
## 24    Geographic             Prostate  126 1.282443e-02
## 25    Geographic                 Skin  174 1.770992e-02
## 26    Geographic               Testis   15 1.526718e-03
## 27    Geographic              Thyroid   34 3.460560e-03
## 28    Geographic              Trachea   12 1.221374e-03
## 29    Geographic                 <NA>   56 5.699746e-03
## 30        Racial              Adipose   53 3.770900e-03
## 31        Racial        Adrenal gland    3 2.134472e-04
## 32        Racial              Bladder    2 1.422981e-04
## 33        Racial                Blood 8955 6.371398e-01
## 34        Racial         Blood vessel   58 4.126645e-03
## 35        Racial          Bone marrow   52 3.699751e-03
## 36        Racial                Brain 1084 7.712558e-02
## 37        Racial               Breast  466 3.315546e-02
## 38        Racial Cancer sample\n(NOS)  720 5.122732e-02
## 39        Racial            Cartilage    3 2.134472e-04
## 40        Racial      Digestive tract    2 1.422981e-04
## 41        Racial                  Eye   47 3.344006e-03
## 42        Racial                Heart  579 4.119530e-02
## 43        Racial                 IPSC  720 5.122732e-02
## 44        Racial            Intestine  170 1.209534e-02
## 45        Racial               Kidney    3 2.134472e-04
## 46        Racial               Larynx    1 7.114906e-05
## 47        Racial                Liver  208 1.479900e-02
## 48        Racial                 Lung  110 7.826396e-03
## 49        Racial           Lymph node   22 1.565279e-03
## 50        Racial               Muscle   16 1.138385e-03
## 51        Racial                 Nose   71 5.051583e-03
## 52        Racial          Oral cavity   83 5.905372e-03
## 53        Racial                Ovary  222 1.579509e-02
## 54        Racial                  PNS   12 8.537887e-04
## 55        Racial             Pancreas    2 1.422981e-04
## 56        Racial             Prostate   81 5.763074e-03
## 57        Racial                 Skin  101 7.186055e-03
## 58        Racial               Spleen    3 2.134472e-04
## 59        Racial              Stomach   45 3.201708e-03
## 60        Racial               Testis    1 7.114906e-05
## 61        Racial               Thymus    1 7.114906e-05
## 62        Racial              Thyroid    1 7.114906e-05
## 63        Racial               Tonsil    6 4.268943e-04
## 64        Racial        Urinary tract   52 3.699751e-03
## 65        Racial               Uterus   50 3.557453e-03
## 66        Racial               Vagina   16 1.138385e-03
## 67        Racial                 <NA>   34 2.419068e-03
allSRAFinal %>% count(finalOrgan, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(finalOrgan) %>% summarise(geoGroups = length(finalOrgan)) %>% arrange(desc(geoGroups)) %>% as.data.frame
##              finalOrgan geoGroups
## 1                 Blood        10
## 2                 Brain         7
## 3             Intestine         6
## 4                 Joint         6
## 5                 Heart         5
## 6                  IPSC         5
## 7          Blood vessel         4
## 8                  Lung         4
## 9                  Skin         4
## 10          Bone marrow         3
## 11 Cancer sample\n(NOS)         3
## 12               Kidney         3
## 13                Liver         3
## 14               Muscle         3
## 15               Testis         3
## 16                  CNS         2
## 17      Pituitary gland         2
## 18             Prostate         2
## 19              Thyroid         2
## 20              Adipose         1
## 21              Bladder         1
## 22           Blastoderm         1
## 23                 Bone         1
## 24               Breast         1
## 25               Morula         1
## 26                 Nose         1
## 27             Placenta         1
## 28              Trachea         1
## 29                 <NA>         1
allSRAFinal %>% count(finalOrgan, strictestRace) %>% drop_na(strictestRace) %>% group_by(finalOrgan) %>% summarise(raceGroups = length(finalOrgan)) %>% arrange(desc(raceGroups)) %>% as.data.frame
##              finalOrgan raceGroups
## 1                 Blood          8
## 2  Cancer sample\n(NOS)          7
## 3                  Nose          6
## 4          Blood vessel          5
## 5                Breast          5
## 6                 Heart          5
## 7             Intestine          5
## 8                 Ovary          5
## 9           Bone marrow          4
## 10                 IPSC          4
## 11                 Lung          4
## 12          Oral cavity          4
## 13                 Skin          4
## 14              Adipose          3
## 15                Brain          3
## 16               Muscle          3
## 17              Stomach          3
## 18        Urinary tract          3
## 19               Uterus          3
## 20                 <NA>          3
## 21              Bladder          2
## 22            Cartilage          2
## 23                Liver          2
## 24           Lymph node          2
## 25                  PNS          2
## 26             Prostate          2
## 27               Spleen          2
## 28               Tonsil          2
## 29               Vagina          2
## 30        Adrenal gland          1
## 31      Digestive tract          1
## 32                  Eye          1
## 33               Kidney          1
## 34               Larynx          1
## 35             Pancreas          1
## 36               Testis          1
## 37               Thymus          1
## 38              Thyroid          1
# And how many tissues is each descriptor associated with?
allSRAFinal %>% drop_na(strictestGeography, finalOrgan) %>% count(finalOrgan) 
##              finalOrgan    n
## 1               Adipose   54
## 2               Bladder   10
## 3            Blastoderm   22
## 4                 Blood 5076
## 5          Blood vessel  131
## 6                  Bone    4
## 7           Bone marrow  102
## 8                 Brain  612
## 9                Breast   35
## 10                  CNS   15
## 11 Cancer sample\n(NOS)   17
## 12                Heart  195
## 13                 IPSC  901
## 14            Intestine  866
## 15                Joint   92
## 16               Kidney   24
## 17                Liver   99
## 18                 Lung  224
## 19               Morula   41
## 20               Muscle  806
## 21                 Nose   43
## 22      Pituitary gland    7
## 23             Placenta   32
## 24             Prostate  126
## 25                 Skin  174
## 26               Testis   15
## 27              Thyroid   34
## 28              Trachea   12
allSRAFinal %>% drop_na(strictestRace, finalOrgan) %>% count(finalOrgan) 
##              finalOrgan    n
## 1               Adipose   53
## 2         Adrenal gland    3
## 3               Bladder    2
## 4                 Blood 8955
## 5          Blood vessel   58
## 6           Bone marrow   52
## 7                 Brain 1084
## 8                Breast  466
## 9  Cancer sample\n(NOS)  720
## 10            Cartilage    3
## 11      Digestive tract    2
## 12                  Eye   47
## 13                Heart  579
## 14                 IPSC  720
## 15            Intestine  170
## 16               Kidney    3
## 17               Larynx    1
## 18                Liver  208
## 19                 Lung  110
## 20           Lymph node   22
## 21               Muscle   16
## 22                 Nose   71
## 23          Oral cavity   83
## 24                Ovary  222
## 25                  PNS   12
## 26             Pancreas    2
## 27             Prostate   81
## 28                 Skin  101
## 29               Spleen    3
## 30              Stomach   45
## 31               Testis    1
## 32               Thymus    1
## 33              Thyroid    1
## 34               Tonsil    6
## 35        Urinary tract   52
## 36               Uterus   50
## 37               Vagina   16
23/28
## [1] 0.8214286
36/37
## [1] 0.972973
allSRAFinal %>% count(finalOrgan, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(strictestGeography) %>% summarise(geoGroups = length(strictestGeography)) %>% arrange(desc(geoGroups)) %>% as.data.frame
##                strictestGeography geoGroups
## 1                          Europe        23
## 2              Sub-Saharan Africa        16
## 3                       East Asia        13
## 4                      Asia (NOS)        12
## 5                      South Asia         6
## 6                        Americas         6
## 7  North Africa and\nWestern Asia         4
## 8                           Other         3
## 9                        Multiple         2
## 10                 Southeast Asia         1
## 11                        Oceania         1
allSRAFinal %>% count(finalOrgan, strictestRace) %>% drop_na(strictestRace) %>% group_by(strictestRace) %>% summarise(raceGroups = length(strictestRace)) %>% arrange(desc(raceGroups)) %>% as.data.frame
##                                 strictestRace raceGroups
## 1                                       White         36
## 2                  Black or\nAfrican American         25
## 3                                    Hispanic         16
## 4                                       Asian         14
## 5                                    Multiple         12
## 6          American Indian and\nAlaska Native          6
## 7                                       Other          4
## 8 Native Hawaiian and\nother Pacific Islander          1
# And now some alluvial plots, for funsies and supplementary data:
allSRAFinal %>% drop_na(strictestGeography) %>% count(worldRegion)
##                  worldRegion    n
## 1       East Asia &\nPacific 1428
## 2     Europe &\nCentral Asia 3437
## 3 Latin America &\nCaribbean   84
## 4              North America 4810
## 5                 South Asia   26
## 6                       <NA>   40
allSRAFinal %>% drop_na(strictestRace) %>% count(worldRegion)
##                   worldRegion     n
## 1        East Asia &\nPacific   386
## 2      Europe &\nCentral Asia  2520
## 3 Middle East &\nNorth Africa    11
## 4               North America 11138
geoTissueFacet <- geographySummary %>% group_by(finalOrgan, strictestGeography, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestGeography, worldRegion)) %>% as.data.frame %>%
  ggplot(., aes(x = finalOrgan, y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  ggtitle("Samples with geographic/ancestry labels deposited in:") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", nrow=2)) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
  theme(strip.background = element_blank()) +
  facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalOrgan', 'strictestGeography'. You can override using the `.groups` argument.
geoTissueFacet

ggsave("fig3_organ_by_geography_faceted.pdf", width=6, height=9)
ggsave("fig3_organ_by_geography_faceted.png", width=6, height=9)

raceTissueFacet <- raceSummary %>% group_by(finalOrgan, strictestRace, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalOrgan, strictestRace, worldRegion)) %>% as.data.frame %>%
  ggplot(., aes(x = finalOrgan, y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  ggtitle("Samples with US Census labels deposited in:") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="")) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
  theme(strip.background = element_blank()) +
  facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalOrgan', 'strictestRace'. You can override using the `.groups` argument.
raceTissueFacet

ggsave("fig3_organ_by_race_faceted.pdf", width=6, height=9)
ggsave("fig3_organ_by_race_faceted.png", width=6, height=9)

And now we should ask, since these proportions look different, are there differences? Is having a given descriptor associated with higher likelihood or something or something else being sequenced?

We can do this at the hasDescriptor level (racial vs geographic), at the finalCountry level and at the worldRegion level, although we also might want to consider the actual descriptors

But first, an easier question: Is blood really more diverse than anything else?

bloodPropGeo <- allSRAFinal %>% filter(grepl("Blood", finalOrgan) & !is.na(strictestGeography)) %>% 
  count(strictestGeography) %>% 
  mutate(freq = n/sum(n))

nobloodPropGeo <- allSRAFinal %>% filter(!grepl("Blood", finalOrgan) & !is.na(strictestGeography)) %>% 
  count(strictestGeography) %>% 
  mutate(freq = n/sum(n))

bloodPropRace <- allSRAFinal %>% filter(grepl("Blood", finalOrgan) & !is.na(strictestRace)) %>% 
  count(strictestRace) %>% 
  mutate(freq = n/sum(n))
  
nobloodPropRace <- allSRAFinal %>% filter(!grepl("Blood", finalOrgan) & !is.na(strictestRace)) %>% 
  count(strictestRace) %>% 
  mutate(freq = n/sum(n))

geoBlood <- full_join(bloodPropGeo, nobloodPropGeo, by="strictestGeography", suffix=c("Blood", "NoBlood")) %>%
  full_join(., geographyProp, by="strictestGeography") %>%
  mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))

raceBlood <- full_join(bloodPropRace, nobloodPropRace, by="strictestRace", suffix=c("Blood", "NoBlood")) %>%
  full_join(., raceProp, by="strictestRace") %>%
  mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))

chisq.test(geoBlood[,c(2,6)])
## Warning in chisq.test(geoBlood[, c(2, 6)]): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  geoBlood[, c(2, 6)]
## X-squared = 509.04, df = 10, p-value < 2.2e-16
chisq.test(raceBlood[,c(2,6)])
## Warning in chisq.test(raceBlood[, c(2, 6)]): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  raceBlood[, c(2, 6)]
## X-squared = 114.6, df = 7, p-value < 2.2e-16
# Both are significant but blood is clearly a lot more diverse
geoBlood
##                strictestGeography nBlood   freqBlood nNoBlood freqNoBlood    n        freq
## 1              Sub-Saharan Africa   1402 0.269252929      292 0.063230836 1694 0.172417303
## 2  North Africa and\nWestern Asia     12 0.002304590        6 0.001299264   18 0.001832061
## 3                          Europe   2194 0.421355867     3275 0.709181464 5469 0.556641221
## 4                      South Asia    693 0.133090071       25 0.005413599  718 0.073078880
## 5                  Southeast Asia     49 0.009410409        0 0.000000000   49 0.004987277
## 6                       East Asia    474 0.091031304      647 0.140103941 1121 0.114096692
## 7                      Asia (NOS)    109 0.020933359      201 0.043525336  310 0.031552163
## 8                        Americas     39 0.007489917      127 0.027501083  166 0.016895674
## 9                        Multiple    180 0.034568850       27 0.005846687  207 0.021068702
## 10                          Other     55 0.010562704        7 0.001515808   62 0.006310433
## 11                        Oceania      0 0.000000000       11 0.002381984   11 0.001119593
raceBlood
##                                 strictestRace nBlood    freqBlood nNoBlood freqNoBlood    n         freq
## 1          American Indian and\nAlaska Native     34 0.0037723289        9 0.001785006   43 0.0030594095
## 2                                       Asian    536 0.0594696549      408 0.080920270  944 0.0671647101
## 3                  Black or\nAfrican American    849 0.0941972706      893 0.177112257 1742 0.1239416578
## 4                                    Hispanic   1056 0.1171640963      199 0.039468465 1255 0.0892920669
## 5                                    Multiple    178 0.0197492511       25 0.004958350  203 0.0144432586
## 6 Native Hawaiian and\nother Pacific Islander      6 0.0006657051        0 0.000000000    6 0.0004268943
## 7                                       Other    155 0.0171973816        7 0.001388338  162 0.0115261473
## 8                                       White   6199 0.6877843115     3501 0.694367315 9700 0.6901458556
allPropGeo <- allSRAFinal %>% filter(!is.na(worldRegion)) %>% 
  group_by(worldRegion) %>%
  count(finalOrgan) %>% 
  as.data.frame() 

# Not very promising...
anova(lm(n ~ finalOrgan + worldRegion, data=allPropGeo))
## Analysis of Variance Table
## 
## Response: n
##             Df   Sum Sq Mean Sq F value Pr(>F)  
## finalOrgan  44 45284988 1029204  0.9390 0.5823  
## worldRegion  5 15606168 3121234  2.8476 0.0303 *
## Residuals   33 36170641 1096080                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
allPropGeo <- allPropGeo %>% 
  cast(., worldRegion ~ finalOrgan) %>%
  mutate(across(where(is.numeric), ~replace(., is.na(.), 0)))
## Using n as value column.  Use the value argument to cast to override this choice
# Boring, uninterpretable, cannot be bothered to go any further. Keep it descriptive
chisq.test(allPropGeo[,2:ncol(allPropGeo)])
## Warning in chisq.test(allPropGeo[, 2:ncol(allPropGeo)]): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  allPropGeo[, 2:ncol(allPropGeo)]
## X-squared = 19965, df = 225, p-value < 2.2e-16

3.5. What about diseases? Are there associations between place and type?

geoDisease <- geographySummary %>% group_by(finalDisease, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography)) %>% as.data.frame %>%
# geoDisease <- meltGeography %>% drop_na(c(finalDisease, strictestGeography)) %>%
  ggplot(., aes(x = fct_rev(finalDisease), y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  xlab("") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", label.position="left", ncol=1)) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
raceDisease <- raceSummary %>% group_by(finalDisease, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace)) %>% as.data.frame %>%
# raceDisease <- meltRace %>% drop_na(c(finalDisease, strictestRace)) %>%
  ggplot(., aes(x = fct_rev(finalDisease), y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  coord_flip() +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="", label.position="left")) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,0), legend.position = c(1,0), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
geoDisease + raceDisease +
  plot_layout(design=long2Design) +
  plot_annotation(tag_levels = 'A') 

ggsave("fig4_disease_by_descriptor.pdf", height=8, width=6)
ggsave("fig4_disease_by_descriptor.png", height=8, width=6)

# And now filtering to only those tissues with more than 50 observations

geoDiseaseSlim <- geographySummary %>% group_by(finalDisease, strictestGeography) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography)) %>% group_by(finalDisease) %>% filter(sum(value) > 9) %>% as.data.frame %>%
# geoDisease <- meltGeography %>% drop_na(c(finalDisease, strictestGeography)) %>%
  ggplot(., aes(x = finalDisease, y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  xlab(NULL) +
  ylab("Samples") +
  # coord_flip() +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", label.position="left", ncol=3)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
raceDiseaseSlim <- raceSummary %>% group_by(finalDisease, strictestRace) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace)) %>% group_by(finalDisease) %>% filter(sum(value) > 9) %>% as.data.frame %>%
# raceDisease <- meltRace %>% drop_na(c(finalDisease, strictestRace)) %>%
  ggplot(., aes(x = finalDisease, y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  xlab(NULL) +
  ylab("Samples") +
  # coord_flip() +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="", label.position="left", ncol=2)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.direction = "vertical", legend.justification = c(1,1), legend.position = c(1,1), legend.background=element_blank())
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
geoDiseaseSlim + raceDiseaseSlim +
  plot_layout(design=wide2Design) +
  plot_annotation(tag_levels = 'A') 

ggsave("fig4_disease_by_descriptor_slim.pdf", width = 7, height = 4.5)
ggsave("fig4_disease_by_descriptor_slim.png", width = 7, height = 4.5)

# Some statistics: The percentage of samples that each tissue accounts for, and how many descriptors are associated:
allSRAFinal %>% count(hasDescriptor, finalDisease) %>% group_by(hasDescriptor) %>% summarise(finalDisease = finalDisease, n = n, proportion = n/sum(n)) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'hasDescriptor'. You can override using the `.groups` argument.
##    hasDescriptor      finalDisease     n   proportion
## 1     Geographic      Acute trauma    12 0.0012213740
## 2     Geographic        Autoimmune   674 0.0686005089
## 3     Geographic             Blood    24 0.0024427481
## 4     Geographic            Cancer   551 0.0560814249
## 5     Geographic    Cardiovascular    44 0.0044783715
## 6     Geographic         Endocrine    10 0.0010178117
## 7     Geographic  Gastrointestinal     7 0.0007124682
## 8     Geographic  Genetic syndrome    57 0.0058015267
## 9     Geographic   Healthy control  1134 0.1154198473
## 10    Geographic        Infectious   276 0.0280916031
## 11    Geographic     Integumentary     7 0.0007124682
## 12    Geographic            Kidney    17 0.0017302799
## 13    Geographic     Mental health    28 0.0028498728
## 14    Geographic         Metabolic     7 0.0007124682
## 15    Geographic Neurodegenerative    72 0.0073282443
## 16    Geographic      Neurological     6 0.0006106870
## 17    Geographic             Other     7 0.0007124682
## 18    Geographic      Reproductive    10 0.0010178117
## 19    Geographic       Respiratory   107 0.0108905852
## 20    Geographic              <NA>  6775 0.6895674300
## 21        Racial        Autoimmune   309 0.0219850587
## 22        Racial             Blood     5 0.0003557453
## 23        Racial            Cancer   981 0.0697972252
## 24        Racial    Cardiovascular    13 0.0009249377
## 25        Racial   Healthy control   930 0.0661686233
## 26        Racial        Infectious    61 0.0043400925
## 27        Racial     Mental health   437 0.0310921380
## 28        Racial         Metabolic   227 0.0161508360
## 29        Racial Neurodegenerative   111 0.0078975454
## 30        Racial      Neurological     8 0.0005691925
## 31        Racial             Other     2 0.0001422981
## 32        Racial      Reproductive     7 0.0004980434
## 33        Racial       Respiratory    20 0.0014229811
## 34        Racial              <NA> 10944 0.7786552828
allSRAFinal %>% count(worldRegion, finalDisease) %>% group_by(worldRegion) %>% summarise(finalDisease = finalDisease, n = n, proportion = n/sum(n)) %>% arrange(finalDisease) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'worldRegion'. You can override using the `.groups` argument.
##                    worldRegion      finalDisease     n   proportion
## 1         East Asia &\nPacific      Acute trauma    12 6.615215e-03
## 2         East Asia &\nPacific        Autoimmune    98 5.402426e-02
## 3       Europe &\nCentral Asia        Autoimmune    32 5.371831e-03
## 4   Latin America &\nCaribbean        Autoimmune     1 1.190476e-02
## 5                North America        Autoimmune   852 5.342363e-02
## 6         East Asia &\nPacific             Blood    27 1.488423e-02
## 7   Latin America &\nCaribbean             Blood     1 1.190476e-02
## 8                North America             Blood     1 6.270379e-05
## 9         East Asia &\nPacific            Cancer   380 2.094818e-01
## 10      Europe &\nCentral Asia            Cancer   208 3.491690e-02
## 11  Latin America &\nCaribbean            Cancer    23 2.738095e-01
## 12               North America            Cancer   907 5.687234e-02
## 13                        <NA>            Cancer    14 3.500000e-01
## 14        East Asia &\nPacific    Cardiovascular    27 1.488423e-02
## 15      Europe &\nCentral Asia    Cardiovascular    15 2.518046e-03
## 16               North America    Cardiovascular    13 8.151492e-04
## 17                  South Asia    Cardiovascular     2 7.692308e-02
## 18        East Asia &\nPacific         Endocrine    10 5.512679e-03
## 19        East Asia &\nPacific  Gastrointestinal     5 2.756340e-03
## 20               North America  Gastrointestinal     2 1.254076e-04
## 21      Europe &\nCentral Asia  Genetic syndrome    45 7.554138e-03
## 22               North America  Genetic syndrome    12 7.524454e-04
## 23        East Asia &\nPacific   Healthy control   122 6.725469e-02
## 24      Europe &\nCentral Asia   Healthy control   834 1.400034e-01
## 25  Latin America &\nCaribbean   Healthy control    27 3.214286e-01
## 26               North America   Healthy control  1061 6.652872e-02
## 27                  South Asia   Healthy control     2 7.692308e-02
## 28                        <NA>   Healthy control    18 4.500000e-01
## 29      Europe &\nCentral Asia        Infectious   208 3.491690e-02
## 30  Latin America &\nCaribbean        Infectious     2 2.380952e-02
## 31               North America        Infectious   127 7.963381e-03
## 32        East Asia &\nPacific     Integumentary     7 3.858875e-03
## 33        East Asia &\nPacific            Kidney    17 9.371555e-03
## 34        East Asia &\nPacific     Mental health     8 4.410143e-03
## 35               North America     Mental health   457 2.865563e-02
## 36      Europe &\nCentral Asia         Metabolic    16 2.685916e-03
## 37               North America         Metabolic   218 1.366943e-02
## 38        East Asia &\nPacific Neurodegenerative    20 1.102536e-02
## 39      Europe &\nCentral Asia Neurodegenerative    19 3.189525e-03
## 40  Latin America &\nCaribbean Neurodegenerative    15 1.785714e-01
## 41               North America Neurodegenerative   129 8.088789e-03
## 42        East Asia &\nPacific      Neurological     6 3.307607e-03
## 43               North America      Neurological     8 5.016303e-04
## 44        East Asia &\nPacific             Other     9 4.961411e-03
## 45        East Asia &\nPacific      Reproductive     7 3.858875e-03
## 46                  South Asia      Reproductive    10 3.846154e-01
## 47        East Asia &\nPacific       Respiratory    25 1.378170e-02
## 48      Europe &\nCentral Asia       Respiratory    48 8.057747e-03
## 49               North America       Respiratory    54 3.386005e-03
## 50        East Asia &\nPacific              <NA>  1034 5.700110e-01
## 51      Europe &\nCentral Asia              <NA>  4532 7.607856e-01
## 52  Latin America &\nCaribbean              <NA>    15 1.785714e-01
## 53 Middle East &\nNorth Africa              <NA>    11 1.000000e+00
## 54               North America              <NA> 12107 7.591548e-01
## 55                  South Asia              <NA>    12 4.615385e-01
## 56                        <NA>              <NA>     8 2.000000e-01
allSRAFinal %>% count(worldRegion, finalDisease) %>% group_by(finalDisease) %>% summarise(worldRegion = worldRegion, n = n, proportion = n/sum(n)) %>% arrange(finalDisease) %>% as.data.frame
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'finalDisease'. You can override using the `.groups` argument.
##         finalDisease                 worldRegion     n   proportion
## 1       Acute trauma        East Asia &\nPacific    12 1.0000000000
## 2         Autoimmune        East Asia &\nPacific    98 0.0996948118
## 3         Autoimmune      Europe &\nCentral Asia    32 0.0325534079
## 4         Autoimmune  Latin America &\nCaribbean     1 0.0010172940
## 5         Autoimmune               North America   852 0.8667344863
## 6              Blood        East Asia &\nPacific    27 0.9310344828
## 7              Blood  Latin America &\nCaribbean     1 0.0344827586
## 8              Blood               North America     1 0.0344827586
## 9             Cancer        East Asia &\nPacific   380 0.2480417755
## 10            Cancer      Europe &\nCentral Asia   208 0.1357702350
## 11            Cancer  Latin America &\nCaribbean    23 0.0150130548
## 12            Cancer               North America   907 0.5920365535
## 13            Cancer                        <NA>    14 0.0091383812
## 14    Cardiovascular        East Asia &\nPacific    27 0.4736842105
## 15    Cardiovascular      Europe &\nCentral Asia    15 0.2631578947
## 16    Cardiovascular               North America    13 0.2280701754
## 17    Cardiovascular                  South Asia     2 0.0350877193
## 18         Endocrine        East Asia &\nPacific    10 1.0000000000
## 19  Gastrointestinal        East Asia &\nPacific     5 0.7142857143
## 20  Gastrointestinal               North America     2 0.2857142857
## 21  Genetic syndrome      Europe &\nCentral Asia    45 0.7894736842
## 22  Genetic syndrome               North America    12 0.2105263158
## 23   Healthy control        East Asia &\nPacific   122 0.0591085271
## 24   Healthy control      Europe &\nCentral Asia   834 0.4040697674
## 25   Healthy control  Latin America &\nCaribbean    27 0.0130813953
## 26   Healthy control               North America  1061 0.5140503876
## 27   Healthy control                  South Asia     2 0.0009689922
## 28   Healthy control                        <NA>    18 0.0087209302
## 29        Infectious      Europe &\nCentral Asia   208 0.6172106825
## 30        Infectious  Latin America &\nCaribbean     2 0.0059347181
## 31        Infectious               North America   127 0.3768545994
## 32     Integumentary        East Asia &\nPacific     7 1.0000000000
## 33            Kidney        East Asia &\nPacific    17 1.0000000000
## 34     Mental health        East Asia &\nPacific     8 0.0172043011
## 35     Mental health               North America   457 0.9827956989
## 36         Metabolic      Europe &\nCentral Asia    16 0.0683760684
## 37         Metabolic               North America   218 0.9316239316
## 38 Neurodegenerative        East Asia &\nPacific    20 0.1092896175
## 39 Neurodegenerative      Europe &\nCentral Asia    19 0.1038251366
## 40 Neurodegenerative  Latin America &\nCaribbean    15 0.0819672131
## 41 Neurodegenerative               North America   129 0.7049180328
## 42      Neurological        East Asia &\nPacific     6 0.4285714286
## 43      Neurological               North America     8 0.5714285714
## 44             Other        East Asia &\nPacific     9 1.0000000000
## 45      Reproductive        East Asia &\nPacific     7 0.4117647059
## 46      Reproductive                  South Asia    10 0.5882352941
## 47       Respiratory        East Asia &\nPacific    25 0.1968503937
## 48       Respiratory      Europe &\nCentral Asia    48 0.3779527559
## 49       Respiratory               North America    54 0.4251968504
## 50              <NA>        East Asia &\nPacific  1034 0.0583554377
## 51              <NA>      Europe &\nCentral Asia  4532 0.2557706417
## 52              <NA>  Latin America &\nCaribbean    15 0.0008465489
## 53              <NA> Middle East &\nNorth Africa    11 0.0006208025
## 54              <NA>               North America 12107 0.6832778373
## 55              <NA>                  South Asia    12 0.0006772391
## 56              <NA>                        <NA>     8 0.0004514927
allSRAFinal %>% count(finalDisease, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(finalDisease) %>% summarise(geoGroups = length(finalDisease)) %>% arrange(desc(geoGroups)) %>% as.data.frame
##         finalDisease geoGroups
## 1               <NA>        11
## 2    Healthy control         8
## 3         Autoimmune         7
## 4             Cancer         5
## 5         Infectious         5
## 6        Respiratory         4
## 7     Cardiovascular         3
## 8  Neurodegenerative         3
## 9       Acute trauma         2
## 10             Blood         2
## 11  Gastrointestinal         2
## 12     Mental health         2
## 13         Metabolic         2
## 14         Endocrine         1
## 15  Genetic syndrome         1
## 16     Integumentary         1
## 17            Kidney         1
## 18      Neurological         1
## 19             Other         1
## 20      Reproductive         1
allSRAFinal %>% count(finalDisease, strictestRace) %>% drop_na(strictestRace) %>% group_by(finalDisease) %>% summarise(raceGroups = length(finalDisease)) %>% arrange(desc(raceGroups)) %>% as.data.frame
##         finalDisease raceGroups
## 1               <NA>          8
## 2             Cancer          7
## 3         Autoimmune          6
## 4    Healthy control          6
## 5          Metabolic          4
## 6         Infectious          3
## 7      Mental health          3
## 8       Neurological          3
## 9              Blood          2
## 10 Neurodegenerative          2
## 11    Cardiovascular          1
## 12             Other          1
## 13      Reproductive          1
## 14       Respiratory          1
# And how many tissues is each descriptor associated with?
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease) 
##         finalDisease    n
## 1       Acute trauma   12
## 2         Autoimmune  674
## 3              Blood   24
## 4             Cancer  551
## 5     Cardiovascular   44
## 6          Endocrine   10
## 7   Gastrointestinal    7
## 8   Genetic syndrome   57
## 9    Healthy control 1134
## 10        Infectious  276
## 11     Integumentary    7
## 12            Kidney   17
## 13     Mental health   28
## 14         Metabolic    7
## 15 Neurodegenerative   72
## 16      Neurological    6
## 17             Other    7
## 18      Reproductive   10
## 19       Respiratory  107
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease) 
##         finalDisease   n
## 1         Autoimmune 309
## 2              Blood   5
## 3             Cancer 981
## 4     Cardiovascular  13
## 5    Healthy control 930
## 6         Infectious  61
## 7      Mental health 437
## 8          Metabolic 227
## 9  Neurodegenerative 111
## 10      Neurological   8
## 11             Other   2
## 12      Reproductive   7
## 13       Respiratory  20
allSRAFinal %>% count(finalDisease, strictestGeography) %>% drop_na(strictestGeography) %>% group_by(strictestGeography) %>% summarise(geoGroups = length(strictestGeography)) %>% arrange(desc(geoGroups)) %>% as.data.frame
##                strictestGeography geoGroups
## 1                          Europe        13
## 2                       East Asia        11
## 3              Sub-Saharan Africa         8
## 4                        Americas         7
## 5                      South Asia         6
## 6                      Asia (NOS)         5
## 7                        Multiple         4
## 8                           Other         4
## 9  North Africa and\nWestern Asia         3
## 10                 Southeast Asia         1
## 11                        Oceania         1
allSRAFinal %>% count(finalDisease, strictestRace) %>% drop_na(strictestRace) %>% group_by(strictestRace) %>% summarise(raceGroups = length(strictestRace)) %>% arrange(desc(raceGroups)) %>% as.data.frame
##                                 strictestRace raceGroups
## 1                                    Hispanic         10
## 2                                       White         10
## 3                  Black or\nAfrican American          9
## 4                                       Asian          8
## 5                                       Other          4
## 6                                    Multiple          3
## 7          American Indian and\nAlaska Native          2
## 8 Native Hawaiian and\nother Pacific Islander          2
# And now some alluvial plots, for funsies and supplementary data:
geoDiseaseFacet <- geographySummary %>% group_by(finalDisease, strictestGeography, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestGeography, worldRegion)) %>% as.data.frame %>%
  ggplot(., aes(x = finalDisease, y = value, fill = strictestGeography)) +
  geom_bar(stat="identity") +
  ggtitle("Samples with geographic/ancestry labels deposited in:") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  scale_fill_geography(name="Population\nDescriptor") +
  guides(fill=guide_legend(title="", nrow=2)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
  theme(strip.background = element_blank()) +
  facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalDisease', 'strictestGeography'. You can override using the `.groups` argument.
geoDiseaseFacet

ggsave("fig4_disease_by_geography_faceted.pdf", width=6, height=9)
ggsave("fig4_disease_by_geography_faceted.png", width=6, height=9)

raceDiseaseFacet <- raceSummary %>% group_by(finalDisease, strictestRace, worldRegion) %>% summarise(value = sum(n)) %>% distinct() %>% drop_na(c(finalDisease, strictestRace, worldRegion)) %>% as.data.frame %>%
  ggplot(., aes(x = finalDisease, y = value, fill = strictestRace)) +
  geom_bar(stat="identity") +
  ggtitle("Samples with US Census labels deposited in:") +
  xlab("Sampled tissue") +
  ylab("Samples") +
  scale_fill_race(name="US Census\nTerm)") +
  guides(fill=guide_legend(title="")) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal") +
  theme(strip.background = element_blank()) +
  facet_wrap(~worldRegion, ncol=1, scales="free_y")
## `summarise()` has grouped output by 'finalDisease', 'strictestRace'. You can override using the `.groups` argument.
raceDiseaseFacet

ggsave("fig4_disease_by_race_faceted.pdf", width=6, height=9)
ggsave("fig4_disease_by_race_faceted.png", width=6, height=9)

4. Attempting to look at some of the relationships between classes:

What we really want to get to is the relationship between tissue and disease, because we think it might be interesting.

# Let's collapse tissues with under 100 observations and diseases with under 10 into "Other"
geoTissuesToKeep <- allSRAFinal %>% drop_na(strictestGeography) %>% count(finalOrgan) %>% filter(n >= 100)
raceTissuesToKeep <- allSRAFinal %>% drop_na(strictestRace) %>% count(finalOrgan) %>% filter(n >= 100)

geoDiseasesToKeep <- allSRAFinal %>% drop_na(strictestGeography) %>% count(finalDisease) %>% filter(n >= 10)
raceDiseasesToKeep <- allSRAFinal %>% drop_na(strictestRace) %>% count(finalDisease) %>% filter(n >= 10)

geoDisAlluvial <- allSRAFinal %>% drop_na(c(strictestGeography, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestGeography, finalOrgan, finalDisease) %>%
filter(grepl(paste(geoTissuesToKeep$finalOrgan, collapse="|"), finalOrgan)) %>%  
filter(grepl(paste(geoDiseasesToKeep$finalDisease, collapse="|"), finalDisease)) %>%  
    ggplot(data = .,
       aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
  scale_x_discrete(limits = c("SRA Submitter\nWBER", "Disease class", "Tissue sequenced"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = strictestGeography)) +
  scale_fill_geography() +
  geom_stratum(width=1/4) +
  ylab("Samples") +
  # geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
  geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
  theme_minimal(base_size = 6) +
  guides(fill=guide_legend(title="Descriptor", nrow=2)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")

raceDisAlluvial <- allSRAFinal %>% drop_na(c(strictestRace, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestRace, finalOrgan, finalDisease) %>% 
  filter(grepl(paste(raceTissuesToKeep$finalOrgan, collapse="|"), finalOrgan)) %>%  
  filter(grepl(paste(raceDiseasesToKeep$finalDisease, collapse="|"), finalDisease)) %>%  
ggplot(data = .,
       aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
  scale_x_discrete(limits = c("SRA Submitter\nWBER", "Tissue sequenced", "Disease class"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = strictestRace)) +
  scale_fill_race() +
  geom_stratum(width=1/4) +
  ylab("Samples") +
  # geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
  geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
  theme_minimal(base_size = 6) +
  guides(fill=guide_legend(title="Descriptor", nrow=2)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")

geoDisAlluvial + raceDisAlluvial +
  plot_layout(design=long2Design) +
  plot_annotation(tag_levels = 'A') 
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.

ggsave("fig4_organ_disease_alluvial.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
geoDisAlluvial <- allSRAFinal %>% drop_na(c(strictestGeography, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestGeography, finalOrgan, finalDisease) %>%
    ggplot(data = .,
       aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
  scale_x_discrete(limits = c("SRA Submitter\nWBER", "Disease class", "Tissue sequenced"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = strictestGeography)) +
  scale_fill_geography() +
  geom_stratum(width=1/4) +
  ylab("Samples") +
  # geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
  geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
  theme_minimal(base_size = 6) +
  guides(fill=guide_legend(title="Descriptor", nrow=2)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")

geoDisAlluvial
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.

ggsave("fig4_organ_disease_geo_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_geo_alluvial_full.png", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
raceDisAlluvial <- allSRAFinal %>% drop_na(c(strictestRace, worldRegion, finalOrgan, finalDisease)) %>% count(worldRegion, strictestRace, finalOrgan, finalDisease) %>% 
ggplot(data = .,
       aes(axis1 = worldRegion, axis2 = finalOrgan, axis3 = finalDisease, y = n)) +
  scale_x_discrete(limits = c("SRA Submitter\nWBER", "Tissue sequenced", "Disease class"), expand = c(.2, .05)) +
  geom_alluvium(aes(fill = strictestRace)) +
  scale_fill_race() +
  geom_stratum(width=1/4) +
  ylab("Samples") +
  # geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=2.5) +
  geom_text_repel(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5, direction = "y", nudge_x = .5) +
  theme_minimal(base_size = 6) +
  guides(fill=guide_legend(title="Descriptor", nrow=2)) +
  theme(legend.title = element_blank(), legend.position="bottom", legend.direction="horizontal")

raceDisAlluvial 
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps

ggsave("fig4_organ_disease_race_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
ggsave("fig4_organ_disease_race_alluvial_full.png", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
geoDisAlluvial + raceDisAlluvial +
  plot_layout(design=long2Design) +
  plot_annotation(tag_levels = 'A') 
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider increasing max.overlaps

ggsave("fig4_organ_disease_alluvial_full.pdf", width = 7, height = 7)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern = params$discern): Some strata appear at multiple axes.

And finally… the numbers that underlie these plots, because otherwise I’ll go crazy trying to tabulate things:

allSRAFinal <- allSRAFinal %>% 
  mutate(hasDisease = if_else(is.na(finalDisease), "No", "Yes"))

# Who has disease info?
allSRAFinal %>% count(hasDisease) %>% mutate(freq = n/sum(n))
##   hasDisease     n      freq
## 1         No 17719 0.7420017
## 2        Yes  6161 0.2579983
allSRAFinal %>% drop_na(strictestGeography) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
##   hasDisease    n      freq
## 1         No 6775 0.6895674
## 2        Yes 3050 0.3104326
allSRAFinal %>% drop_na(strictestRace) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
##   hasDisease     n      freq
## 1         No 10944 0.7786553
## 2        Yes  3111 0.2213447
# Some of these could work better as plots:
ggplot(allSRAFinal, aes(x = finalOrgan, fill = hasDisease)) +
  geom_bar(position = "fill") +
  ggtitle("") +
  xlab("Sequenced tissue") +
  ylab("Proportion of samples") +
  coord_fixed(ratio=6) +
  guides(fill=guide_legend(title="Disease info?")) +
  theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5))

allSRAFinal %>% drop_na(finalOrgan) %>% count(hasDisease) %>% mutate(freq = n/sum(n))
##   hasDisease     n      freq
## 1         No 17707 0.7443043
## 2        Yes  6083 0.2556957
allSRAFinal %>% drop_na(finalOrgan) %>% count(finalOrgan, hasDisease) %>% group_by(finalOrgan) %>% mutate(freq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##              finalOrgan hasDisease     n        freq
## 1               Adipose         No   104 0.971962617
## 2               Adipose        Yes     3 0.028037383
## 3         Adrenal gland        Yes     3 1.000000000
## 4               Bladder         No     1 0.083333333
## 5               Bladder        Yes    11 0.916666667
## 6            Blastoderm         No    22 1.000000000
## 7                 Blood         No 12489 0.890100492
## 8                 Blood        Yes  1542 0.109899508
## 9          Blood vessel         No   102 0.539682540
## 10         Blood vessel        Yes    87 0.460317460
## 11                 Bone         No     4 1.000000000
## 12          Bone marrow         No    62 0.402597403
## 13          Bone marrow        Yes    92 0.597402597
## 14                Brain         No   969 0.571344340
## 15                Brain        Yes   727 0.428655660
## 16               Breast         No   353 0.704590818
## 17               Breast        Yes   148 0.295409182
## 18                  CNS         No     8 0.533333333
## 19                  CNS        Yes     7 0.466666667
## 20 Cancer sample\n(NOS)        Yes   737 1.000000000
## 21            Cartilage         No     3 1.000000000
## 22      Digestive tract        Yes     2 1.000000000
## 23                  Eye         No    47 1.000000000
## 24                Heart         No   752 0.971576227
## 25                Heart        Yes    22 0.028423773
## 26                 IPSC         No   541 0.333744602
## 27                 IPSC        Yes  1080 0.666255398
## 28            Intestine         No   263 0.253861004
## 29            Intestine        Yes   773 0.746138996
## 30                Joint        Yes    92 1.000000000
## 31               Kidney         No    25 0.925925926
## 32               Kidney        Yes     2 0.074074074
## 33               Larynx        Yes     1 1.000000000
## 34                Liver         No    63 0.205211726
## 35                Liver        Yes   244 0.794788274
## 36                 Lung         No   282 0.844311377
## 37                 Lung        Yes    52 0.155688623
## 38           Lymph node         No    20 0.909090909
## 39           Lymph node        Yes     2 0.090909091
## 40               Morula         No    41 1.000000000
## 41               Muscle         No   803 0.976885645
## 42               Muscle        Yes    19 0.023114355
## 43                 Nose         No    79 0.692982456
## 44                 Nose        Yes    35 0.307017544
## 45          Oral cavity         No    66 0.795180723
## 46          Oral cavity        Yes    17 0.204819277
## 47                Ovary         No   221 0.995495495
## 48                Ovary        Yes     1 0.004504505
## 49                  PNS         No    12 1.000000000
## 50             Pancreas        Yes     2 1.000000000
## 51      Pituitary gland         No     7 1.000000000
## 52             Placenta         No    32 1.000000000
## 53             Prostate         No    65 0.314009662
## 54             Prostate        Yes   142 0.685990338
## 55                 Skin         No   155 0.563636364
## 56                 Skin        Yes   120 0.436363636
## 57               Spleen        Yes     3 1.000000000
## 58              Stomach         No     1 0.022222222
## 59              Stomach        Yes    44 0.977777778
## 60               Testis         No     1 0.062500000
## 61               Testis        Yes    15 0.937500000
## 62               Thymus        Yes     1 1.000000000
## 63              Thyroid         No     6 0.171428571
## 64              Thyroid        Yes    29 0.828571429
## 65               Tonsil        Yes     6 1.000000000
## 66              Trachea        Yes    12 1.000000000
## 67        Urinary tract         No    52 1.000000000
## 68               Uterus         No    40 0.800000000
## 69               Uterus        Yes    10 0.200000000
## 70               Vagina         No    16 1.000000000
# Increasing levels of disease and geography granularity:
allSRAFinal %>% drop_na(strictestGeography) %>% count(strictestGeography, hasDisease) %>% mutate(freq = n/sum(n))
##                strictestGeography hasDisease    n         freq
## 1              Sub-Saharan Africa         No 1288 0.1310941476
## 2              Sub-Saharan Africa        Yes  406 0.0413231552
## 3  North Africa and\nWestern Asia         No   15 0.0015267176
## 4  North Africa and\nWestern Asia        Yes    3 0.0003053435
## 5                          Europe         No 3735 0.3801526718
## 6                          Europe        Yes 1734 0.1764885496
## 7                      South Asia         No  688 0.0700254453
## 8                      South Asia        Yes   30 0.0030534351
## 9                  Southeast Asia         No   49 0.0049872774
## 10                      East Asia         No  683 0.0695165394
## 11                      East Asia        Yes  438 0.0445801527
## 12                     Asia (NOS)         No  173 0.0176081425
## 13                     Asia (NOS)        Yes  137 0.0139440204
## 14                        Oceania         No   11 0.0011195929
## 15                       Americas         No   56 0.0056997455
## 16                       Americas        Yes  110 0.0111959288
## 17                       Multiple         No   27 0.0027480916
## 18                       Multiple        Yes  180 0.0183206107
## 19                          Other         No   50 0.0050890585
## 20                          Other        Yes   12 0.0012213740
allSRAFinal %>% drop_na(strictestRace) %>% count(strictestRace, hasDisease) %>% mutate(freq = n/sum(n))
##                                  strictestRace hasDisease    n         freq
## 1           American Indian and\nAlaska Native         No   40 2.845962e-03
## 2           American Indian and\nAlaska Native        Yes    3 2.134472e-04
## 3                                        Asian         No  718 5.108502e-02
## 4                                        Asian        Yes  226 1.607969e-02
## 5                   Black or\nAfrican American         No 1395 9.925293e-02
## 6                   Black or\nAfrican American        Yes  347 2.468872e-02
## 7                                     Hispanic         No 1058 7.527570e-02
## 8                                     Hispanic        Yes  197 1.401636e-02
## 9                                     Multiple         No  182 1.294913e-02
## 10                                    Multiple        Yes   21 1.494130e-03
## 11 Native Hawaiian and\nother Pacific Islander         No    5 3.557453e-04
## 12 Native Hawaiian and\nother Pacific Islander        Yes    1 7.114906e-05
## 13                                       Other         No  158 1.124155e-02
## 14                                       Other        Yes    4 2.845962e-04
## 15                                       White         No 7388 5.256492e-01
## 16                                       White        Yes 2312 1.644966e-01
allSRAFinal %>% drop_na(worldRegion) %>% count(worldRegion, hasDisease) %>% mutate(freq = n/sum(n))
##                    worldRegion hasDisease     n         freq
## 1         East Asia &\nPacific         No  1034 0.0433724832
## 2         East Asia &\nPacific        Yes   780 0.0327181208
## 3       Europe &\nCentral Asia         No  4532 0.1901006711
## 4       Europe &\nCentral Asia        Yes  1425 0.0597734899
## 5   Latin America &\nCaribbean         No    15 0.0006291946
## 6   Latin America &\nCaribbean        Yes    69 0.0028942953
## 7  Middle East &\nNorth Africa         No    11 0.0004614094
## 8                North America         No 12107 0.5078439597
## 9                North America        Yes  3841 0.1611157718
## 10                  South Asia         No    12 0.0005033557
## 11                  South Asia        Yes    14 0.0005872483
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(strictestGeography, finalDisease) %>% mutate(freq = n/sum(n))
##                strictestGeography      finalDisease   n         freq
## 1              Sub-Saharan Africa        Autoimmune 164 0.0537704918
## 2              Sub-Saharan Africa            Cancer   1 0.0003278689
## 3              Sub-Saharan Africa   Healthy control  66 0.0216393443
## 4              Sub-Saharan Africa        Infectious 155 0.0508196721
## 5              Sub-Saharan Africa     Mental health   8 0.0026229508
## 6              Sub-Saharan Africa         Metabolic   6 0.0019672131
## 7              Sub-Saharan Africa       Respiratory   6 0.0019672131
## 8  North Africa and\nWestern Asia   Healthy control   1 0.0003278689
## 9  North Africa and\nWestern Asia       Respiratory   2 0.0006557377
## 10                         Europe      Acute trauma   4 0.0013114754
## 11                         Europe        Autoimmune 380 0.1245901639
## 12                         Europe            Cancer 196 0.0642622951
## 13                         Europe    Cardiovascular  15 0.0049180328
## 14                         Europe  Gastrointestinal   2 0.0006557377
## 15                         Europe  Genetic syndrome  57 0.0186885246
## 16                         Europe   Healthy control 866 0.2839344262
## 17                         Europe        Infectious  52 0.0170491803
## 18                         Europe     Mental health  20 0.0065573770
## 19                         Europe Neurodegenerative  47 0.0154098361
## 20                         Europe      Neurological   6 0.0019672131
## 21                         Europe       Respiratory  89 0.0291803279
## 22                     South Asia        Autoimmune   9 0.0029508197
## 23                     South Asia    Cardiovascular   2 0.0006557377
## 24                     South Asia   Healthy control   2 0.0006557377
## 25                     South Asia     Integumentary   7 0.0022950820
## 26                     South Asia      Reproductive  10 0.0032786885
## 27                      East Asia      Acute trauma   8 0.0026229508
## 28                      East Asia        Autoimmune  62 0.0203278689
## 29                      East Asia             Blood  23 0.0075409836
## 30                      East Asia            Cancer 256 0.0839344262
## 31                      East Asia    Cardiovascular  27 0.0088524590
## 32                      East Asia  Gastrointestinal   5 0.0016393443
## 33                      East Asia   Healthy control  23 0.0075409836
## 34                      East Asia            Kidney  17 0.0055737705
## 35                      East Asia Neurodegenerative  10 0.0032786885
## 36                      East Asia             Other   7 0.0022950820
## 37                     Asia (NOS)        Autoimmune  33 0.0108196721
## 38                     Asia (NOS)            Cancer  75 0.0245901639
## 39                     Asia (NOS)         Endocrine  10 0.0032786885
## 40                     Asia (NOS)   Healthy control  19 0.0062295082
## 41                       Americas        Autoimmune   5 0.0016393443
## 42                       Americas             Blood   1 0.0003278689
## 43                       Americas            Cancer  23 0.0075409836
## 44                       Americas   Healthy control  38 0.0124590164
## 45                       Americas        Infectious  28 0.0091803279
## 46                       Americas Neurodegenerative  15 0.0049180328
## 47                       Multiple        Autoimmune  21 0.0068852459
## 48                       Multiple   Healthy control 119 0.0390163934
## 49                       Multiple        Infectious  40 0.0131147541
## 50                          Other        Infectious   1 0.0003278689
## 51                          Other         Metabolic   1 0.0003278689
## 52                          Other       Respiratory  10 0.0032786885
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(strictestRace, finalDisease) %>% mutate(freq = n/sum(n))
##                                  strictestRace      finalDisease   n         freq
## 1           American Indian and\nAlaska Native            Cancer   3 0.0009643202
## 2                                        Asian        Autoimmune  62 0.0199292832
## 3                                        Asian             Blood   4 0.0012857602
## 4                                        Asian            Cancer  65 0.0208936033
## 5                                        Asian   Healthy control  67 0.0215364834
## 6                                        Asian         Metabolic   1 0.0003214401
## 7                                        Asian      Reproductive   7 0.0022500804
## 8                                        Asian       Respiratory  20 0.0064288010
## 9                   Black or\nAfrican American        Autoimmune  10 0.0032144005
## 10                  Black or\nAfrican American             Blood   1 0.0003214401
## 11                  Black or\nAfrican American            Cancer 100 0.0321440051
## 12                  Black or\nAfrican American   Healthy control 136 0.0437158470
## 13                  Black or\nAfrican American        Infectious  25 0.0080360013
## 14                  Black or\nAfrican American     Mental health  70 0.0225008036
## 15                  Black or\nAfrican American         Metabolic   2 0.0006428801
## 16                  Black or\nAfrican American      Neurological   3 0.0009643202
## 17                                    Hispanic        Autoimmune  62 0.0199292832
## 18                                    Hispanic            Cancer  57 0.0183220829
## 19                                    Hispanic   Healthy control  61 0.0196078431
## 20                                    Hispanic        Infectious   4 0.0012857602
## 21                                    Hispanic     Mental health   5 0.0016072003
## 22                                    Hispanic         Metabolic   2 0.0006428801
## 23                                    Hispanic Neurodegenerative   3 0.0009643202
## 24                                    Hispanic      Neurological   1 0.0003214401
## 25                                    Hispanic             Other   2 0.0006428801
## 26                                    Multiple            Cancer  10 0.0032144005
## 27                                    Multiple   Healthy control  11 0.0035358406
## 28 Native Hawaiian and\nother Pacific Islander        Autoimmune   1 0.0003214401
## 29                                       Other        Autoimmune   1 0.0003214401
## 30                                       Other            Cancer   1 0.0003214401
## 31                                       Other   Healthy control   2 0.0006428801
## 32                                       White        Autoimmune 173 0.0556091289
## 33                                       White            Cancer 745 0.2394728383
## 34                                       White    Cardiovascular  13 0.0041787207
## 35                                       White   Healthy control 653 0.2099003536
## 36                                       White        Infectious  32 0.0102860816
## 37                                       White     Mental health 362 0.1163612986
## 38                                       White         Metabolic 222 0.0713596914
## 39                                       White Neurodegenerative 108 0.0347155256
## 40                                       White      Neurological   4 0.0012857602
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(worldRegion, finalDisease) %>% mutate(freq = n/sum(n)) 
##                   worldRegion      finalDisease    n         freq
## 1        East Asia &\nPacific      Acute trauma   12 0.0019579050
## 2        East Asia &\nPacific        Autoimmune   98 0.0159895578
## 3        East Asia &\nPacific             Blood   27 0.0044052863
## 4        East Asia &\nPacific            Cancer  380 0.0620003263
## 5        East Asia &\nPacific    Cardiovascular   27 0.0044052863
## 6        East Asia &\nPacific         Endocrine   10 0.0016315875
## 7        East Asia &\nPacific  Gastrointestinal    5 0.0008157938
## 8        East Asia &\nPacific   Healthy control  122 0.0199053679
## 9        East Asia &\nPacific     Integumentary    7 0.0011421113
## 10       East Asia &\nPacific            Kidney   17 0.0027736988
## 11       East Asia &\nPacific     Mental health    8 0.0013052700
## 12       East Asia &\nPacific Neurodegenerative   20 0.0032631751
## 13       East Asia &\nPacific      Neurological    6 0.0009789525
## 14       East Asia &\nPacific             Other    9 0.0014684288
## 15       East Asia &\nPacific      Reproductive    7 0.0011421113
## 16       East Asia &\nPacific       Respiratory   25 0.0040789688
## 17     Europe &\nCentral Asia        Autoimmune   32 0.0052210801
## 18     Europe &\nCentral Asia            Cancer  208 0.0339370207
## 19     Europe &\nCentral Asia    Cardiovascular   15 0.0024473813
## 20     Europe &\nCentral Asia  Genetic syndrome   45 0.0073421439
## 21     Europe &\nCentral Asia   Healthy control  834 0.1360744004
## 22     Europe &\nCentral Asia        Infectious  208 0.0339370207
## 23     Europe &\nCentral Asia         Metabolic   16 0.0026105401
## 24     Europe &\nCentral Asia Neurodegenerative   19 0.0031000163
## 25     Europe &\nCentral Asia       Respiratory   48 0.0078316202
## 26 Latin America &\nCaribbean        Autoimmune    1 0.0001631588
## 27 Latin America &\nCaribbean             Blood    1 0.0001631588
## 28 Latin America &\nCaribbean            Cancer   23 0.0037526513
## 29 Latin America &\nCaribbean   Healthy control   27 0.0044052863
## 30 Latin America &\nCaribbean        Infectious    2 0.0003263175
## 31 Latin America &\nCaribbean Neurodegenerative   15 0.0024473813
## 32              North America        Autoimmune  852 0.1390112580
## 33              North America             Blood    1 0.0001631588
## 34              North America            Cancer  907 0.1479849894
## 35              North America    Cardiovascular   13 0.0021210638
## 36              North America  Gastrointestinal    2 0.0003263175
## 37              North America  Genetic syndrome   12 0.0019579050
## 38              North America   Healthy control 1061 0.1731114374
## 39              North America        Infectious  127 0.0207211617
## 40              North America     Mental health  457 0.0745635503
## 41              North America         Metabolic  218 0.0355686083
## 42              North America Neurodegenerative  129 0.0210474792
## 43              North America      Neurological    8 0.0013052700
## 44              North America       Respiratory   54 0.0088105727
## 45                 South Asia    Cardiovascular    2 0.0003263175
## 46                 South Asia   Healthy control    2 0.0003263175
## 47                 South Asia      Reproductive   10 0.0016315875
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease, strictestGeography) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##         finalDisease             strictestGeography   n         freq  diseaseFreq
## 1       Acute trauma                         Europe   4 0.0013114754 0.3333333333
## 2       Acute trauma                      East Asia   8 0.0026229508 0.6666666667
## 3         Autoimmune             Sub-Saharan Africa 164 0.0537704918 0.2433234421
## 4         Autoimmune                         Europe 380 0.1245901639 0.5637982196
## 5         Autoimmune                     South Asia   9 0.0029508197 0.0133531157
## 6         Autoimmune                      East Asia  62 0.0203278689 0.0919881306
## 7         Autoimmune                     Asia (NOS)  33 0.0108196721 0.0489614243
## 8         Autoimmune                       Americas   5 0.0016393443 0.0074183976
## 9         Autoimmune                       Multiple  21 0.0068852459 0.0311572700
## 10             Blood                      East Asia  23 0.0075409836 0.9583333333
## 11             Blood                       Americas   1 0.0003278689 0.0416666667
## 12            Cancer             Sub-Saharan Africa   1 0.0003278689 0.0018148820
## 13            Cancer                         Europe 196 0.0642622951 0.3557168784
## 14            Cancer                      East Asia 256 0.0839344262 0.4646098004
## 15            Cancer                     Asia (NOS)  75 0.0245901639 0.1361161525
## 16            Cancer                       Americas  23 0.0075409836 0.0417422868
## 17    Cardiovascular                         Europe  15 0.0049180328 0.3409090909
## 18    Cardiovascular                     South Asia   2 0.0006557377 0.0454545455
## 19    Cardiovascular                      East Asia  27 0.0088524590 0.6136363636
## 20         Endocrine                     Asia (NOS)  10 0.0032786885 1.0000000000
## 21  Gastrointestinal                         Europe   2 0.0006557377 0.2857142857
## 22  Gastrointestinal                      East Asia   5 0.0016393443 0.7142857143
## 23  Genetic syndrome                         Europe  57 0.0186885246 1.0000000000
## 24   Healthy control             Sub-Saharan Africa  66 0.0216393443 0.0582010582
## 25   Healthy control North Africa and\nWestern Asia   1 0.0003278689 0.0008818342
## 26   Healthy control                         Europe 866 0.2839344262 0.7636684303
## 27   Healthy control                     South Asia   2 0.0006557377 0.0017636684
## 28   Healthy control                      East Asia  23 0.0075409836 0.0202821869
## 29   Healthy control                     Asia (NOS)  19 0.0062295082 0.0167548501
## 30   Healthy control                       Americas  38 0.0124590164 0.0335097002
## 31   Healthy control                       Multiple 119 0.0390163934 0.1049382716
## 32        Infectious             Sub-Saharan Africa 155 0.0508196721 0.5615942029
## 33        Infectious                         Europe  52 0.0170491803 0.1884057971
## 34        Infectious                       Americas  28 0.0091803279 0.1014492754
## 35        Infectious                       Multiple  40 0.0131147541 0.1449275362
## 36        Infectious                          Other   1 0.0003278689 0.0036231884
## 37     Integumentary                     South Asia   7 0.0022950820 1.0000000000
## 38            Kidney                      East Asia  17 0.0055737705 1.0000000000
## 39     Mental health             Sub-Saharan Africa   8 0.0026229508 0.2857142857
## 40     Mental health                         Europe  20 0.0065573770 0.7142857143
## 41         Metabolic             Sub-Saharan Africa   6 0.0019672131 0.8571428571
## 42         Metabolic                          Other   1 0.0003278689 0.1428571429
## 43 Neurodegenerative                         Europe  47 0.0154098361 0.6527777778
## 44 Neurodegenerative                      East Asia  10 0.0032786885 0.1388888889
## 45 Neurodegenerative                       Americas  15 0.0049180328 0.2083333333
## 46      Neurological                         Europe   6 0.0019672131 1.0000000000
## 47             Other                      East Asia   7 0.0022950820 1.0000000000
## 48      Reproductive                     South Asia  10 0.0032786885 1.0000000000
## 49       Respiratory             Sub-Saharan Africa   6 0.0019672131 0.0560747664
## 50       Respiratory North Africa and\nWestern Asia   2 0.0006557377 0.0186915888
## 51       Respiratory                         Europe  89 0.0291803279 0.8317757009
## 52       Respiratory                          Other  10 0.0032786885 0.0934579439
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease, strictestRace) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##         finalDisease                               strictestRace   n         freq diseaseFreq
## 1         Autoimmune                                       Asian  62 0.0199292832 0.200647249
## 2         Autoimmune                  Black or\nAfrican American  10 0.0032144005 0.032362460
## 3         Autoimmune                                    Hispanic  62 0.0199292832 0.200647249
## 4         Autoimmune Native Hawaiian and\nother Pacific Islander   1 0.0003214401 0.003236246
## 5         Autoimmune                                       Other   1 0.0003214401 0.003236246
## 6         Autoimmune                                       White 173 0.0556091289 0.559870550
## 7              Blood                                       Asian   4 0.0012857602 0.800000000
## 8              Blood                  Black or\nAfrican American   1 0.0003214401 0.200000000
## 9             Cancer          American Indian and\nAlaska Native   3 0.0009643202 0.003058104
## 10            Cancer                                       Asian  65 0.0208936033 0.066258919
## 11            Cancer                  Black or\nAfrican American 100 0.0321440051 0.101936799
## 12            Cancer                                    Hispanic  57 0.0183220829 0.058103976
## 13            Cancer                                    Multiple  10 0.0032144005 0.010193680
## 14            Cancer                                       Other   1 0.0003214401 0.001019368
## 15            Cancer                                       White 745 0.2394728383 0.759429154
## 16    Cardiovascular                                       White  13 0.0041787207 1.000000000
## 17   Healthy control                                       Asian  67 0.0215364834 0.072043011
## 18   Healthy control                  Black or\nAfrican American 136 0.0437158470 0.146236559
## 19   Healthy control                                    Hispanic  61 0.0196078431 0.065591398
## 20   Healthy control                                    Multiple  11 0.0035358406 0.011827957
## 21   Healthy control                                       Other   2 0.0006428801 0.002150538
## 22   Healthy control                                       White 653 0.2099003536 0.702150538
## 23        Infectious                  Black or\nAfrican American  25 0.0080360013 0.409836066
## 24        Infectious                                    Hispanic   4 0.0012857602 0.065573770
## 25        Infectious                                       White  32 0.0102860816 0.524590164
## 26     Mental health                  Black or\nAfrican American  70 0.0225008036 0.160183066
## 27     Mental health                                    Hispanic   5 0.0016072003 0.011441648
## 28     Mental health                                       White 362 0.1163612986 0.828375286
## 29         Metabolic                                       Asian   1 0.0003214401 0.004405286
## 30         Metabolic                  Black or\nAfrican American   2 0.0006428801 0.008810573
## 31         Metabolic                                    Hispanic   2 0.0006428801 0.008810573
## 32         Metabolic                                       White 222 0.0713596914 0.977973568
## 33 Neurodegenerative                                    Hispanic   3 0.0009643202 0.027027027
## 34 Neurodegenerative                                       White 108 0.0347155256 0.972972973
## 35      Neurological                  Black or\nAfrican American   3 0.0009643202 0.375000000
## 36      Neurological                                    Hispanic   1 0.0003214401 0.125000000
## 37      Neurological                                       White   4 0.0012857602 0.500000000
## 38             Other                                    Hispanic   2 0.0006428801 1.000000000
## 39      Reproductive                                       Asian   7 0.0022500804 1.000000000
## 40       Respiratory                                       Asian  20 0.0064288010 1.000000000
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(finalDisease, worldRegion) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##         finalDisease                worldRegion    n         freq  diseaseFreq
## 1       Acute trauma       East Asia &\nPacific   12 0.0019579050 1.0000000000
## 2         Autoimmune       East Asia &\nPacific   98 0.0159895578 0.0996948118
## 3         Autoimmune     Europe &\nCentral Asia   32 0.0052210801 0.0325534079
## 4         Autoimmune Latin America &\nCaribbean    1 0.0001631588 0.0010172940
## 5         Autoimmune              North America  852 0.1390112580 0.8667344863
## 6              Blood       East Asia &\nPacific   27 0.0044052863 0.9310344828
## 7              Blood Latin America &\nCaribbean    1 0.0001631588 0.0344827586
## 8              Blood              North America    1 0.0001631588 0.0344827586
## 9             Cancer       East Asia &\nPacific  380 0.0620003263 0.2503293808
## 10            Cancer     Europe &\nCentral Asia  208 0.0339370207 0.1370223979
## 11            Cancer Latin America &\nCaribbean   23 0.0037526513 0.0151515152
## 12            Cancer              North America  907 0.1479849894 0.5974967062
## 13    Cardiovascular       East Asia &\nPacific   27 0.0044052863 0.4736842105
## 14    Cardiovascular     Europe &\nCentral Asia   15 0.0024473813 0.2631578947
## 15    Cardiovascular              North America   13 0.0021210638 0.2280701754
## 16    Cardiovascular                 South Asia    2 0.0003263175 0.0350877193
## 17         Endocrine       East Asia &\nPacific   10 0.0016315875 1.0000000000
## 18  Gastrointestinal       East Asia &\nPacific    5 0.0008157938 0.7142857143
## 19  Gastrointestinal              North America    2 0.0003263175 0.2857142857
## 20  Genetic syndrome     Europe &\nCentral Asia   45 0.0073421439 0.7894736842
## 21  Genetic syndrome              North America   12 0.0019579050 0.2105263158
## 22   Healthy control       East Asia &\nPacific  122 0.0199053679 0.0596285435
## 23   Healthy control     Europe &\nCentral Asia  834 0.1360744004 0.4076246334
## 24   Healthy control Latin America &\nCaribbean   27 0.0044052863 0.0131964809
## 25   Healthy control              North America 1061 0.1731114374 0.5185728250
## 26   Healthy control                 South Asia    2 0.0003263175 0.0009775171
## 27        Infectious     Europe &\nCentral Asia  208 0.0339370207 0.6172106825
## 28        Infectious Latin America &\nCaribbean    2 0.0003263175 0.0059347181
## 29        Infectious              North America  127 0.0207211617 0.3768545994
## 30     Integumentary       East Asia &\nPacific    7 0.0011421113 1.0000000000
## 31            Kidney       East Asia &\nPacific   17 0.0027736988 1.0000000000
## 32     Mental health       East Asia &\nPacific    8 0.0013052700 0.0172043011
## 33     Mental health              North America  457 0.0745635503 0.9827956989
## 34         Metabolic     Europe &\nCentral Asia   16 0.0026105401 0.0683760684
## 35         Metabolic              North America  218 0.0355686083 0.9316239316
## 36 Neurodegenerative       East Asia &\nPacific   20 0.0032631751 0.1092896175
## 37 Neurodegenerative     Europe &\nCentral Asia   19 0.0031000163 0.1038251366
## 38 Neurodegenerative Latin America &\nCaribbean   15 0.0024473813 0.0819672131
## 39 Neurodegenerative              North America  129 0.0210474792 0.7049180328
## 40      Neurological       East Asia &\nPacific    6 0.0009789525 0.4285714286
## 41      Neurological              North America    8 0.0013052700 0.5714285714
## 42             Other       East Asia &\nPacific    9 0.0014684288 1.0000000000
## 43      Reproductive       East Asia &\nPacific    7 0.0011421113 0.4117647059
## 44      Reproductive                 South Asia   10 0.0016315875 0.5882352941
## 45       Respiratory       East Asia &\nPacific   25 0.0040789688 0.1968503937
## 46       Respiratory     Europe &\nCentral Asia   48 0.0078316202 0.3779527559
## 47       Respiratory              North America   54 0.0088105727 0.4251968504
# And here comes the awfulness... disease by tissue and the rest:
allSRAFinal %>% drop_na(strictestGeography, finalDisease) %>% count(finalDisease, finalOrgan, strictestGeography) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##          finalDisease           finalOrgan             strictestGeography   n         freq  diseaseFreq
## 1        Acute trauma         Blood vessel                         Europe   3 0.0009836066 0.2500000000
## 2        Acute trauma         Blood vessel                      East Asia   8 0.0026229508 0.6666666667
## 3        Acute trauma                Heart                         Europe   1 0.0003278689 0.0833333333
## 4          Autoimmune                Blood             Sub-Saharan Africa  35 0.0114754098 0.0519287834
## 5          Autoimmune                Blood                         Europe   8 0.0026229508 0.0118694362
## 6          Autoimmune          Bone marrow                       Americas   1 0.0003278689 0.0014836795
## 7          Autoimmune            Intestine             Sub-Saharan Africa 127 0.0416393443 0.1884272997
## 8          Autoimmune            Intestine                         Europe 290 0.0950819672 0.4302670623
## 9          Autoimmune            Intestine                     South Asia   5 0.0016393443 0.0074183976
## 10         Autoimmune            Intestine                     Asia (NOS)   3 0.0009836066 0.0044510386
## 11         Autoimmune            Intestine                       Multiple  21 0.0068852459 0.0311572700
## 12         Autoimmune                Joint             Sub-Saharan Africa   2 0.0006557377 0.0029673591
## 13         Autoimmune                Joint                         Europe  50 0.0163934426 0.0741839763
## 14         Autoimmune                Joint                     South Asia   4 0.0013114754 0.0059347181
## 15         Autoimmune                Joint                      East Asia   2 0.0006557377 0.0029673591
## 16         Autoimmune                Joint                     Asia (NOS)  30 0.0098360656 0.0445103858
## 17         Autoimmune                Joint                       Americas   4 0.0013114754 0.0059347181
## 18         Autoimmune                 Skin                         Europe  18 0.0059016393 0.0267062315
## 19         Autoimmune                 Skin                      East Asia  60 0.0196721311 0.0890207715
## 20         Autoimmune              Thyroid                         Europe  14 0.0045901639 0.0207715134
## 21              Blood          Bone marrow                      East Asia  23 0.0075409836 0.9583333333
## 22              Blood          Bone marrow                       Americas   1 0.0003278689 0.0416666667
## 23             Cancer              Bladder                      East Asia  10 0.0032786885 0.0181488203
## 24             Cancer         Blood vessel                         Europe  24 0.0078688525 0.0435571688
## 25             Cancer          Bone marrow                      East Asia  20 0.0065573770 0.0362976407
## 26             Cancer          Bone marrow                       Americas  23 0.0075409836 0.0417422868
## 27             Cancer               Breast                         Europe  30 0.0098360656 0.0544464610
## 28             Cancer Cancer sample\n(NOS)             Sub-Saharan Africa   1 0.0003278689 0.0018148820
## 29             Cancer Cancer sample\n(NOS)                         Europe  14 0.0045901639 0.0254083485
## 30             Cancer Cancer sample\n(NOS)                     Asia (NOS)   2 0.0006557377 0.0036297641
## 31             Cancer                 IPSC                     Asia (NOS)  39 0.0127868852 0.0707803993
## 32             Cancer            Intestine                         Europe   4 0.0013114754 0.0072595281
## 33             Cancer            Intestine                      East Asia 208 0.0681967213 0.3774954628
## 34             Cancer                Liver                      East Asia  18 0.0059016393 0.0326678766
## 35             Cancer                Liver                     Asia (NOS)  34 0.0111475410 0.0617059891
## 36             Cancer                 Lung                         Europe   1 0.0003278689 0.0018148820
## 37             Cancer             Prostate                         Europe  94 0.0308196721 0.1705989111
## 38             Cancer                 Skin                         Europe   9 0.0029508197 0.0163339383
## 39             Cancer                 <NA>                         Europe  20 0.0065573770 0.0362976407
## 40     Cardiovascular                Blood                      East Asia   6 0.0019672131 0.1363636364
## 41     Cardiovascular         Blood vessel                      East Asia  21 0.0068852459 0.4772727273
## 42     Cardiovascular                Heart                         Europe  15 0.0049180328 0.3409090909
## 43     Cardiovascular               Testis                     South Asia   2 0.0006557377 0.0454545455
## 44          Endocrine              Thyroid                     Asia (NOS)  10 0.0032786885 1.0000000000
## 45   Gastrointestinal                Blood                      East Asia   5 0.0016393443 0.7142857143
## 46   Gastrointestinal            Intestine                         Europe   2 0.0006557377 0.2857142857
## 47   Genetic syndrome                Blood                         Europe  38 0.0124590164 0.6666666667
## 48   Genetic syndrome                 IPSC                         Europe  12 0.0039344262 0.2105263158
## 49   Genetic syndrome                 Nose                         Europe   7 0.0022950820 0.1228070175
## 50    Healthy control                Blood             Sub-Saharan Africa  27 0.0088524590 0.0238095238
## 51    Healthy control                Blood                         Europe  73 0.0239344262 0.0643738977
## 52    Healthy control                Blood                      East Asia  22 0.0072131148 0.0194003527
## 53    Healthy control                Blood                     Asia (NOS)   1 0.0003278689 0.0008818342
## 54    Healthy control                Blood                       Americas  11 0.0036065574 0.0097001764
## 55    Healthy control                Blood                       Multiple 113 0.0370491803 0.0996472663
## 56    Healthy control         Blood vessel             Sub-Saharan Africa   8 0.0026229508 0.0070546737
## 57    Healthy control         Blood vessel North Africa and\nWestern Asia   1 0.0003278689 0.0008818342
## 58    Healthy control         Blood vessel                         Europe  12 0.0039344262 0.0105820106
## 59    Healthy control         Blood vessel                      East Asia   1 0.0003278689 0.0008818342
## 60    Healthy control          Bone marrow                         Europe  12 0.0039344262 0.0105820106
## 61    Healthy control                Brain             Sub-Saharan Africa   8 0.0026229508 0.0070546737
## 62    Healthy control                Brain                         Europe  18 0.0059016393 0.0158730159
## 63    Healthy control                 IPSC                         Europe 610 0.2000000000 0.5379188713
## 64    Healthy control                 IPSC                     South Asia   2 0.0006557377 0.0017636684
## 65    Healthy control                 IPSC                     Asia (NOS)  15 0.0049180328 0.0132275132
## 66    Healthy control                 IPSC                       Americas  27 0.0088524590 0.0238095238
## 67    Healthy control            Intestine             Sub-Saharan Africa  23 0.0075409836 0.0202821869
## 68    Healthy control            Intestine                         Europe  45 0.0147540984 0.0396825397
## 69    Healthy control            Intestine                     Asia (NOS)   2 0.0006557377 0.0017636684
## 70    Healthy control            Intestine                       Multiple   6 0.0019672131 0.0052910053
## 71    Healthy control                 Lung                         Europe  12 0.0039344262 0.0105820106
## 72    Healthy control               Muscle                         Europe  16 0.0052459016 0.0141093474
## 73    Healthy control                 Nose                         Europe  28 0.0091803279 0.0246913580
## 74    Healthy control                 Skin                         Europe  11 0.0036065574 0.0097001764
## 75    Healthy control               Testis                         Europe   2 0.0006557377 0.0017636684
## 76    Healthy control               Testis                     Asia (NOS)   1 0.0003278689 0.0008818342
## 77    Healthy control              Thyroid                         Europe   5 0.0016393443 0.0044091711
## 78    Healthy control              Trachea                         Europe  12 0.0039344262 0.0105820106
## 79    Healthy control                 <NA>                         Europe  10 0.0032786885 0.0088183422
## 80         Infectious                Blood             Sub-Saharan Africa 151 0.0495081967 0.5471014493
## 81         Infectious                Blood                         Europe  44 0.0144262295 0.1594202899
## 82         Infectious                Blood                       Americas  26 0.0085245902 0.0942028986
## 83         Infectious                Blood                       Multiple  40 0.0131147541 0.1449275362
## 84         Infectious                Blood                          Other   1 0.0003278689 0.0036231884
## 85         Infectious          Bone marrow                       Americas   2 0.0006557377 0.0072463768
## 86         Infectious                 Lung             Sub-Saharan Africa   4 0.0013114754 0.0144927536
## 87         Infectious                 <NA>                         Europe   8 0.0026229508 0.0289855072
## 88      Integumentary                Blood                     South Asia   7 0.0022950820 1.0000000000
## 89             Kidney                Blood                      East Asia  17 0.0055737705 1.0000000000
## 90      Mental health                Brain             Sub-Saharan Africa   8 0.0026229508 0.2857142857
## 91      Mental health                 IPSC                         Europe  20 0.0065573770 0.7142857143
## 92          Metabolic                Blood             Sub-Saharan Africa   6 0.0019672131 0.8571428571
## 93          Metabolic                Blood                          Other   1 0.0003278689 0.1428571429
## 94  Neurodegenerative                Blood                      East Asia  10 0.0032786885 0.1388888889
## 95  Neurodegenerative                Brain                         Europe  10 0.0032786885 0.1388888889
## 96  Neurodegenerative                  CNS                         Europe   7 0.0022950820 0.0972222222
## 97  Neurodegenerative                 IPSC                         Europe  12 0.0039344262 0.1666666667
## 98  Neurodegenerative                 IPSC                       Americas  15 0.0049180328 0.2083333333
## 99  Neurodegenerative                 <NA>                         Europe  18 0.0059016393 0.2500000000
## 100      Neurological                 Skin                         Europe   6 0.0019672131 1.0000000000
## 101             Other                Blood                      East Asia   7 0.0022950820 1.0000000000
## 102      Reproductive               Testis                     South Asia  10 0.0032786885 1.0000000000
## 103       Respiratory                Blood                         Europe  74 0.0242622951 0.6915887850
## 104       Respiratory                Blood                          Other  10 0.0032786885 0.0934579439
## 105       Respiratory         Blood vessel North Africa and\nWestern Asia   2 0.0006557377 0.0186915888
## 106       Respiratory         Blood vessel                         Europe   3 0.0009836066 0.0280373832
## 107       Respiratory                 Lung             Sub-Saharan Africa   6 0.0019672131 0.0560747664
## 108       Respiratory                 Lung                         Europe  12 0.0039344262 0.1121495327
allSRAFinal %>% drop_na(strictestRace, finalDisease) %>% count(finalDisease, finalOrgan, strictestRace) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##         finalDisease           finalOrgan                               strictestRace   n         freq diseaseFreq
## 1         Autoimmune                Blood                                       Asian  36 0.0115718419 0.116504854
## 2         Autoimmune                Blood                  Black or\nAfrican American  10 0.0032144005 0.032362460
## 3         Autoimmune                Blood                                    Hispanic  59 0.0189649630 0.190938511
## 4         Autoimmune                Blood Native Hawaiian and\nother Pacific Islander   1 0.0003214401 0.003236246
## 5         Autoimmune                Blood                                       Other   1 0.0003214401 0.003236246
## 6         Autoimmune                Blood                                       White 173 0.0556091289 0.559870550
## 7         Autoimmune            Intestine                                       Asian  26 0.0083574413 0.084142395
## 8         Autoimmune            Intestine                                    Hispanic   3 0.0009643202 0.009708738
## 9              Blood                Blood                                       Asian   4 0.0012857602 0.800000000
## 10             Blood                Blood                  Black or\nAfrican American   1 0.0003214401 0.200000000
## 11            Cancer                Blood                                       White   3 0.0009643202 0.003058104
## 12            Cancer          Bone marrow                                       Asian   1 0.0003214401 0.001019368
## 13            Cancer          Bone marrow                                    Hispanic   1 0.0003214401 0.001019368
## 14            Cancer          Bone marrow                                       White   8 0.0025715204 0.008154944
## 15            Cancer               Breast                                       Asian  36 0.0115718419 0.036697248
## 16            Cancer               Breast                  Black or\nAfrican American  40 0.0128576021 0.040774720
## 17            Cancer               Breast                                    Hispanic   1 0.0003214401 0.001019368
## 18            Cancer               Breast                                       White  41 0.0131790421 0.041794088
## 19            Cancer Cancer sample\n(NOS)          American Indian and\nAlaska Native   3 0.0009643202 0.003058104
## 20            Cancer Cancer sample\n(NOS)                                       Asian  28 0.0090003214 0.028542304
## 21            Cancer Cancer sample\n(NOS)                  Black or\nAfrican American  58 0.0186435230 0.059123344
## 22            Cancer Cancer sample\n(NOS)                                    Hispanic  14 0.0045001607 0.014271152
## 23            Cancer Cancer sample\n(NOS)                                    Multiple  10 0.0032144005 0.010193680
## 24            Cancer Cancer sample\n(NOS)                                       Other   1 0.0003214401 0.001019368
## 25            Cancer Cancer sample\n(NOS)                                       White 606 0.1947926712 0.617737003
## 26            Cancer               Larynx                                       White   1 0.0003214401 0.001019368
## 27            Cancer                 Lung                                       White   9 0.0028929605 0.009174312
## 28            Cancer           Lymph node                                       White   2 0.0006428801 0.002038736
## 29            Cancer          Oral cavity                  Black or\nAfrican American   1 0.0003214401 0.001019368
## 30            Cancer          Oral cavity                                       White  16 0.0051430408 0.016309888
## 31            Cancer             Prostate                                       White  48 0.0154291225 0.048929664
## 32            Cancer                 Skin                                       White   6 0.0019286403 0.006116208
## 33            Cancer              Stomach                                    Hispanic  41 0.0131790421 0.041794088
## 34            Cancer               Tonsil                  Black or\nAfrican American   1 0.0003214401 0.001019368
## 35            Cancer               Tonsil                                       White   5 0.0016072003 0.005096840
## 36    Cardiovascular                Blood                                       White  12 0.0038572806 0.923076923
## 37    Cardiovascular                Liver                                       White   1 0.0003214401 0.076923077
## 38   Healthy control              Adipose                                    Multiple   1 0.0003214401 0.001075269
## 39   Healthy control              Bladder                                    Multiple   1 0.0003214401 0.001075269
## 40   Healthy control                Blood                                       Asian  64 0.0205721633 0.068817204
## 41   Healthy control                Blood                  Black or\nAfrican American  48 0.0154291225 0.051612903
## 42   Healthy control                Blood                                    Hispanic  32 0.0102860816 0.034408602
## 43   Healthy control                Blood                                       Other   2 0.0006428801 0.002150538
## 44   Healthy control                Blood                                       White 282 0.0906460945 0.303225806
## 45   Healthy control                Brain                  Black or\nAfrican American  86 0.0276438444 0.092473118
## 46   Healthy control                Brain                                    Hispanic  19 0.0061073610 0.020430108
## 47   Healthy control                Brain                                       White 186 0.0597878496 0.200000000
## 48   Healthy control                Heart                                    Multiple   2 0.0006428801 0.002150538
## 49   Healthy control                Heart                                       White   1 0.0003214401 0.001075269
## 50   Healthy control                 IPSC                                       White 169 0.0543233687 0.181720430
## 51   Healthy control            Intestine                                    Hispanic   2 0.0006428801 0.002150538
## 52   Healthy control            Intestine                                    Multiple   2 0.0006428801 0.002150538
## 53   Healthy control                 Lung                                    Hispanic   6 0.0019286403 0.006451613
## 54   Healthy control                 Lung                                    Multiple   1 0.0003214401 0.001075269
## 55   Healthy control               Muscle                                    Multiple   1 0.0003214401 0.001075269
## 56   Healthy control                 Skin                                       White   1 0.0003214401 0.001075269
## 57   Healthy control               Spleen                                    Multiple   1 0.0003214401 0.001075269
## 58   Healthy control              Stomach                                    Multiple   1 0.0003214401 0.001075269
## 59   Healthy control               Thymus                                    Multiple   1 0.0003214401 0.001075269
## 60   Healthy control               Uterus                                       Asian   3 0.0009643202 0.003225806
## 61   Healthy control                 <NA>                  Black or\nAfrican American   2 0.0006428801 0.002150538
## 62   Healthy control                 <NA>                                    Hispanic   2 0.0006428801 0.002150538
## 63   Healthy control                 <NA>                                       White  14 0.0045001607 0.015053763
## 64        Infectious                Blood                  Black or\nAfrican American  25 0.0080360013 0.409836066
## 65        Infectious                Blood                                    Hispanic   4 0.0012857602 0.065573770
## 66        Infectious                Blood                                       White  32 0.0102860816 0.524590164
## 67     Mental health              Adipose                                       White   2 0.0006428801 0.004576659
## 68     Mental health        Adrenal gland                                       White   2 0.0006428801 0.004576659
## 69     Mental health         Blood vessel                                       White   2 0.0006428801 0.004576659
## 70     Mental health                Brain                  Black or\nAfrican American  70 0.0225008036 0.160183066
## 71     Mental health                Brain                                    Hispanic   5 0.0016072003 0.011441648
## 72     Mental health                Brain                                       White 198 0.0636451302 0.453089245
## 73     Mental health      Digestive tract                                       White   2 0.0006428801 0.004576659
## 74     Mental health                Heart                                       White   3 0.0009643202 0.006864989
## 75     Mental health                 IPSC                                       White 135 0.0433944069 0.308924485
## 76     Mental health            Intestine                                       White   4 0.0012857602 0.009153318
## 77     Mental health                 Lung                                       White   1 0.0003214401 0.002288330
## 78     Mental health               Muscle                                       White   2 0.0006428801 0.004576659
## 79     Mental health                Ovary                                       White   1 0.0003214401 0.002288330
## 80     Mental health             Pancreas                                       White   2 0.0006428801 0.004576659
## 81     Mental health               Spleen                                       White   2 0.0006428801 0.004576659
## 82     Mental health              Stomach                                       White   2 0.0006428801 0.004576659
## 83     Mental health                 <NA>                                       White   4 0.0012857602 0.009153318
## 84         Metabolic        Adrenal gland                                       White   1 0.0003214401 0.004405286
## 85         Metabolic                 IPSC                                    Hispanic   2 0.0006428801 0.008810573
## 86         Metabolic                 IPSC                                       White  22 0.0070716811 0.096916300
## 87         Metabolic               Kidney                                       White   2 0.0006428801 0.008810573
## 88         Metabolic                Liver                                       White 191 0.0613950498 0.841409692
## 89         Metabolic                 Skin                                       Asian   1 0.0003214401 0.004405286
## 90         Metabolic                 Skin                  Black or\nAfrican American   2 0.0006428801 0.008810573
## 91         Metabolic                 Skin                                       White   6 0.0019286403 0.026431718
## 92 Neurodegenerative                Brain                                    Hispanic   3 0.0009643202 0.027027027
## 93 Neurodegenerative                Brain                                       White 108 0.0347155256 0.972972973
## 94      Neurological                Brain                  Black or\nAfrican American   3 0.0009643202 0.375000000
## 95      Neurological                Brain                                    Hispanic   1 0.0003214401 0.125000000
## 96      Neurological                Brain                                       White   4 0.0012857602 0.500000000
## 97             Other         Blood vessel                                    Hispanic   2 0.0006428801 1.000000000
## 98      Reproductive               Uterus                                       Asian   7 0.0022500804 1.000000000
## 99       Respiratory                Blood                                       Asian  20 0.0064288010 1.000000000
allSRAFinal %>% drop_na(worldRegion, finalDisease) %>% count(finalDisease, finalOrgan, worldRegion) %>% mutate(freq = n/sum(n)) %>% group_by(finalDisease) %>% mutate(diseaseFreq = n/sum(n)) %>% ungroup() %>% as.data.frame()
##          finalDisease           finalOrgan                worldRegion   n         freq  diseaseFreq
## 1        Acute trauma         Blood vessel       East Asia &\nPacific  11 0.0017947463 0.9166666667
## 2        Acute trauma                Heart       East Asia &\nPacific   1 0.0001631588 0.0833333333
## 3          Autoimmune                Blood       East Asia &\nPacific  12 0.0019579050 0.0122075280
## 4          Autoimmune                Blood              North America 311 0.0507423723 0.3163784334
## 5          Autoimmune          Bone marrow Latin America &\nCaribbean   1 0.0001631588 0.0010172940
## 6          Autoimmune            Intestine       East Asia &\nPacific  26 0.0042421276 0.0264496439
## 7          Autoimmune            Intestine              North America 449 0.0732582803 0.4567650051
## 8          Autoimmune                Joint              North America  92 0.0150106053 0.0935910478
## 9          Autoimmune                 Skin       East Asia &\nPacific  60 0.0097895252 0.0610376399
## 10         Autoimmune                 Skin     Europe &\nCentral Asia  18 0.0029368576 0.0183112920
## 11         Autoimmune              Thyroid     Europe &\nCentral Asia  14 0.0022842225 0.0142421160
## 12              Blood                Blood       East Asia &\nPacific   4 0.0006526350 0.1379310345
## 13              Blood                Blood              North America   1 0.0001631588 0.0344827586
## 14              Blood          Bone marrow       East Asia &\nPacific  23 0.0037526513 0.7931034483
## 15              Blood          Bone marrow Latin America &\nCaribbean   1 0.0001631588 0.0344827586
## 16             Cancer              Bladder       East Asia &\nPacific  10 0.0016315875 0.0065876153
## 17             Cancer                Blood              North America   3 0.0004894763 0.0019762846
## 18             Cancer         Blood vessel     Europe &\nCentral Asia  24 0.0039158101 0.0158102767
## 19             Cancer          Bone marrow       East Asia &\nPacific  20 0.0032631751 0.0131752306
## 20             Cancer          Bone marrow Latin America &\nCaribbean  23 0.0037526513 0.0151515152
## 21             Cancer          Bone marrow              North America  10 0.0016315875 0.0065876153
## 22             Cancer               Breast       East Asia &\nPacific  36 0.0058737151 0.0237154150
## 23             Cancer               Breast     Europe &\nCentral Asia  30 0.0048947626 0.0197628458
## 24             Cancer               Breast              North America  82 0.0133790178 0.0540184453
## 25             Cancer Cancer sample\n(NOS)       East Asia &\nPacific  15 0.0024473813 0.0098814229
## 26             Cancer Cancer sample\n(NOS)              North America 708 0.1155163975 0.4664031621
## 27             Cancer                 IPSC       East Asia &\nPacific  39 0.0063631914 0.0256916996
## 28             Cancer            Intestine       East Asia &\nPacific 208 0.0339370207 0.1370223979
## 29             Cancer            Intestine              North America   4 0.0006526350 0.0026350461
## 30             Cancer               Larynx              North America   1 0.0001631588 0.0006587615
## 31             Cancer                Liver       East Asia &\nPacific  52 0.0084842552 0.0342555995
## 32             Cancer                 Lung     Europe &\nCentral Asia   1 0.0001631588 0.0006587615
## 33             Cancer                 Lung              North America   9 0.0014684288 0.0059288538
## 34             Cancer           Lymph node              North America   2 0.0003263175 0.0013175231
## 35             Cancer          Oral cavity              North America  17 0.0027736988 0.0111989460
## 36             Cancer             Prostate     Europe &\nCentral Asia 142 0.0231685430 0.0935441370
## 37             Cancer                 Skin     Europe &\nCentral Asia   9 0.0014684288 0.0059288538
## 38             Cancer                 Skin              North America   6 0.0009789525 0.0039525692
## 39             Cancer              Stomach              North America  41 0.0066895089 0.0270092227
## 40             Cancer               Tonsil              North America   6 0.0009789525 0.0039525692
## 41             Cancer                 <NA>     Europe &\nCentral Asia   2 0.0003263175 0.0013175231
## 42             Cancer                 <NA>              North America  18 0.0029368576 0.0118577075
## 43     Cardiovascular                Blood       East Asia &\nPacific   6 0.0009789525 0.1052631579
## 44     Cardiovascular                Blood              North America  12 0.0019579050 0.2105263158
## 45     Cardiovascular         Blood vessel       East Asia &\nPacific  21 0.0034263338 0.3684210526
## 46     Cardiovascular                Heart     Europe &\nCentral Asia  15 0.0024473813 0.2631578947
## 47     Cardiovascular                Liver              North America   1 0.0001631588 0.0175438596
## 48     Cardiovascular               Testis                 South Asia   2 0.0003263175 0.0350877193
## 49          Endocrine              Thyroid       East Asia &\nPacific  10 0.0016315875 1.0000000000
## 50   Gastrointestinal                Blood       East Asia &\nPacific   5 0.0008157938 0.7142857143
## 51   Gastrointestinal            Intestine              North America   2 0.0003263175 0.2857142857
## 52   Genetic syndrome                Blood     Europe &\nCentral Asia  38 0.0062000326 0.6666666667
## 53   Genetic syndrome                 IPSC              North America  12 0.0019579050 0.2105263158
## 54   Genetic syndrome                 Nose     Europe &\nCentral Asia   7 0.0011421113 0.1228070175
## 55    Healthy control              Adipose              North America   1 0.0001631588 0.0004887586
## 56    Healthy control              Bladder              North America   1 0.0001631588 0.0004887586
## 57    Healthy control                Blood       East Asia &\nPacific  58 0.0094632077 0.0283479961
## 58    Healthy control                Blood     Europe &\nCentral Asia 190 0.0310001632 0.0928641251
## 59    Healthy control                Blood              North America 427 0.0696687877 0.2086999022
## 60    Healthy control         Blood vessel       East Asia &\nPacific  22 0.0035894926 0.0107526882
## 61    Healthy control          Bone marrow              North America  12 0.0019579050 0.0058651026
## 62    Healthy control                Brain       East Asia &\nPacific  10 0.0016315875 0.0048875855
## 63    Healthy control                Brain     Europe &\nCentral Asia   5 0.0008157938 0.0024437928
## 64    Healthy control                Brain              North America 302 0.0492739435 0.1476050831
## 65    Healthy control                Heart              North America   3 0.0004894763 0.0014662757
## 66    Healthy control                 IPSC       East Asia &\nPacific  20 0.0032631751 0.0097751711
## 67    Healthy control                 IPSC     Europe &\nCentral Asia 582 0.0949583945 0.2844574780
## 68    Healthy control                 IPSC Latin America &\nCaribbean  27 0.0044052863 0.0131964809
## 69    Healthy control                 IPSC              North America 194 0.0316527982 0.0948191593
## 70    Healthy control            Intestine              North America  80 0.0130527003 0.0391006843
## 71    Healthy control                 Lung     Europe &\nCentral Asia  18 0.0029368576 0.0087976540
## 72    Healthy control                 Lung              North America   1 0.0001631588 0.0004887586
## 73    Healthy control               Muscle     Europe &\nCentral Asia  16 0.0026105401 0.0078201369
## 74    Healthy control               Muscle              North America   1 0.0001631588 0.0004887586
## 75    Healthy control                 Nose     Europe &\nCentral Asia  10 0.0016315875 0.0048875855
## 76    Healthy control                 Skin       East Asia &\nPacific   4 0.0006526350 0.0019550342
## 77    Healthy control                 Skin     Europe &\nCentral Asia   8 0.0013052700 0.0039100684
## 78    Healthy control               Spleen              North America   1 0.0001631588 0.0004887586
## 79    Healthy control              Stomach              North America   1 0.0001631588 0.0004887586
## 80    Healthy control               Testis       East Asia &\nPacific   1 0.0001631588 0.0004887586
## 81    Healthy control               Testis                 South Asia   2 0.0003263175 0.0009775171
## 82    Healthy control               Thymus              North America   1 0.0001631588 0.0004887586
## 83    Healthy control              Thyroid     Europe &\nCentral Asia   5 0.0008157938 0.0024437928
## 84    Healthy control              Trachea              North America  12 0.0019579050 0.0058651026
## 85    Healthy control               Uterus       East Asia &\nPacific   3 0.0004894763 0.0014662757
## 86    Healthy control                 <NA>       East Asia &\nPacific   4 0.0006526350 0.0019550342
## 87    Healthy control                 <NA>              North America  24 0.0039158101 0.0117302053
## 88         Infectious                Blood     Europe &\nCentral Asia 208 0.0339370207 0.6172106825
## 89         Infectious                Blood              North America 115 0.0187632566 0.3412462908
## 90         Infectious          Bone marrow Latin America &\nCaribbean   2 0.0003263175 0.0059347181
## 91         Infectious                 Lung              North America   4 0.0006526350 0.0118694362
## 92         Infectious                 <NA>              North America   8 0.0013052700 0.0237388724
## 93      Integumentary                Blood       East Asia &\nPacific   7 0.0011421113 1.0000000000
## 94             Kidney                Blood       East Asia &\nPacific  17 0.0027736988 1.0000000000
## 95      Mental health              Adipose              North America   2 0.0003263175 0.0043010753
## 96      Mental health        Adrenal gland              North America   2 0.0003263175 0.0043010753
## 97      Mental health         Blood vessel              North America   2 0.0003263175 0.0043010753
## 98      Mental health                Brain              North America 281 0.0458476097 0.6043010753
## 99      Mental health      Digestive tract              North America   2 0.0003263175 0.0043010753
## 100     Mental health                Heart              North America   3 0.0004894763 0.0064516129
## 101     Mental health                 IPSC       East Asia &\nPacific   8 0.0013052700 0.0172043011
## 102     Mental health                 IPSC              North America 147 0.0239843368 0.3161290323
## 103     Mental health            Intestine              North America   4 0.0006526350 0.0086021505
## 104     Mental health                 Lung              North America   1 0.0001631588 0.0021505376
## 105     Mental health               Muscle              North America   2 0.0003263175 0.0043010753
## 106     Mental health                Ovary              North America   1 0.0001631588 0.0021505376
## 107     Mental health             Pancreas              North America   2 0.0003263175 0.0043010753
## 108     Mental health               Spleen              North America   2 0.0003263175 0.0043010753
## 109     Mental health              Stomach              North America   2 0.0003263175 0.0043010753
## 110     Mental health                 <NA>              North America   4 0.0006526350 0.0086021505
## 111         Metabolic        Adrenal gland              North America   1 0.0001631588 0.0042735043
## 112         Metabolic                Blood     Europe &\nCentral Asia   7 0.0011421113 0.0299145299
## 113         Metabolic                 IPSC              North America  24 0.0039158101 0.1025641026
## 114         Metabolic               Kidney              North America   2 0.0003263175 0.0085470085
## 115         Metabolic                Liver              North America 191 0.0311633219 0.8162393162
## 116         Metabolic                 Skin     Europe &\nCentral Asia   9 0.0014684288 0.0384615385
## 117 Neurodegenerative                Blood       East Asia &\nPacific  10 0.0016315875 0.0546448087
## 118 Neurodegenerative                Brain       East Asia &\nPacific  10 0.0016315875 0.0546448087
## 119 Neurodegenerative                Brain     Europe &\nCentral Asia   7 0.0011421113 0.0382513661
## 120 Neurodegenerative                Brain              North America 104 0.0169685104 0.5683060109
## 121 Neurodegenerative                  CNS              North America   7 0.0011421113 0.0382513661
## 122 Neurodegenerative                 IPSC     Europe &\nCentral Asia  12 0.0019579050 0.0655737705
## 123 Neurodegenerative                 IPSC Latin America &\nCaribbean  15 0.0024473813 0.0819672131
## 124 Neurodegenerative                 <NA>              North America  18 0.0029368576 0.0983606557
## 125      Neurological                Brain              North America   8 0.0013052700 0.5714285714
## 126      Neurological                 Skin       East Asia &\nPacific   6 0.0009789525 0.4285714286
## 127             Other                Blood       East Asia &\nPacific   7 0.0011421113 0.7777777778
## 128             Other         Blood vessel       East Asia &\nPacific   2 0.0003263175 0.2222222222
## 129      Reproductive               Testis                 South Asia  10 0.0016315875 0.5882352941
## 130      Reproductive               Uterus       East Asia &\nPacific   7 0.0011421113 0.4117647059
## 131       Respiratory                Blood       East Asia &\nPacific  20 0.0032631751 0.1574803150
## 132       Respiratory                Blood     Europe &\nCentral Asia  30 0.0048947626 0.2362204724
## 133       Respiratory                Blood              North America  54 0.0088105727 0.4251968504
## 134       Respiratory         Blood vessel       East Asia &\nPacific   5 0.0008157938 0.0393700787
## 135       Respiratory                 Lung     Europe &\nCentral Asia  18 0.0029368576 0.1417322835

And then we write the file out, for the joint plotting adventures…

saveRDS(allSRAFinal, file="20240901_allSRAFinal_for_plotting.rds")